claude-scientific-skills/scientific-skills/geomaster/references/big-data.md

# Big Data and Cloud Computing

Distributed processing, cloud platforms, and GPU acceleration for geospatial data.

## Distributed Processing with Dask

### Dask-GeoPandas

```python
import dask_geopandas
import geopandas as gpd
import dask.dataframe as dd

# Read large GeoPackage in chunks
dask_gdf = dask_geopandas.read_file('large.gpkg', npartitions=10)

# Perform spatial operations
dask_gdf['area'] = dask_gdf.geometry.area
dask_gdf['buffer'] = dask_gdf.geometry.buffer(1000)

# Compute result
result = dask_gdf.compute()

# Distributed spatial join
dask_points = dask_geopandas.read_file('points.gpkg', npartitions=5)
dask_zones = dask_geopandas.read_file('zones.gpkg', npartitions=3)

joined = dask_points.sjoin(dask_zones, how='inner', predicate='within')
result = joined.compute()
```

### Dask for Raster Processing

```python
import dask.array as da
import rasterio

# Create lazy-loaded raster array
def lazy_raster(path, chunks=(1, 1024, 1024)):
    with rasterio.open(path) as src:
        profile = src.profile
        # Create dask array
        raster = da.from_rasterio(src, chunks=chunks)

    return raster, profile

# Process large raster
raster, profile = lazy_raster('very_large.tif')

# Calculate NDVI (lazy operation)
ndvi = (raster[3] - raster[2]) / (raster[3] + raster[2] + 1e-8)

# Apply function to each chunk
def process_chunk(chunk):
    return (chunk - chunk.min()) / (chunk.max() - chunk.min())

normalized = da.map_blocks(process_chunk, ndvi, dtype=np.float32)

# Compute and save
with rasterio.open('output.tif', 'w', **profile) as dst:
    dst.write(normalized.compute())
```

### Dask Distributed Cluster

```python
from dask.distributed import Client

# Connect to cluster
client = Client('scheduler-address:8786')

# Or create local cluster
from dask.distributed import LocalCluster
cluster = LocalCluster(n_workers=4, threads_per_worker=2, memory_limit='4GB')
client = Client(cluster)

# Use Dask-GeoPandas with cluster
dask_gdf = dask_geopandas.from_geopandas(gdf, npartitions=10)
dask_gdf = dask_gdf.set_index(calculate_spatial_partitions=True)

# Operations are now distributed
result = dask_gdf.buffer(1000).compute()
```

## Cloud Platforms

### Google Earth Engine

```python
import ee

# Initialize
ee.Initialize(project='your-project')

# Large-scale composite
def create_annual_composite(year):
    """Create cloud-free annual composite."""

    # Sentinel-2 collection
    s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED') \
        .filterBounds(ee.Geometry.Rectangle([-125, 32, -114, 42])) \
        .filterDate(f'{year}-01-01', f'{year}-12-31') \
        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20))

    # Cloud masking
    def mask_s2(image):
        qa = image.select('QA60')
        cloud_bit_mask = 1 << 10
        cirrus_bit_mask = 1 << 11
        mask = qa.bitwiseAnd(cloud_bit_mask).eq(0).And(
               qa.bitwiseAnd(cirrus_bit_mask).eq(0))
        return image.updateMask(mask.Not())

    s2_masked = s2.map(mask_s2)

    # Median composite
    composite = s2_masked.median().clip(roi)

    return composite

# Export to Google Drive
task = ee.batch.Export.image.toDrive(
    image=composite,
    description='CA_composite_2023',
    scale=10,
    region=roi,
    crs='EPSG:32611',
    maxPixels=1e13
)
task.start()
```

### Planetary Computer (Microsoft)

```python
import pystac_client
import planetary_computer
import odc.stac
import xarray as xr

# Search catalog
catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1",
    modifier=planetary_computer.sign_inplace,
)

# Search NAIP imagery
search = catalog.search(
    collections=["naip"],
    bbox=[-125, 32, -114, 42],
    datetime="2020-01-01/2023-12-31",
)

items = list(search.get_items())

# Load as xarray dataset
data = odc.stac.load(
    items[:100],  # Process in batches
    bands=["image"],
    crs="EPSG:32611",
    resolution=1.0,
    chunkx=1024,
    chunky=1024,
)

# Compute statistics lazily
mean = data.mean().compute()
std = data.std().compute()

# Export to COG
import rioxarray
data.isel(time=0).rio.to_raster('naip_composite.tif', compress='DEFLATE')
```

### Google Cloud Storage

```python
from google.cloud import storage
import rasterio
from rasterio.session import GSSession

# Upload to GCS
client = storage.Client()
bucket = client.bucket('my-bucket')
blob = bucket.blob('geospatial/data.tif')
blob.upload_from_filename('local_data.tif')

# Read directly from GCS
with rasterio.open(
    'gs://my-bucket/geospatial/data.tif',
    session=GSSession()
) as src:
    data = src.read()

# Use with Rioxarray
import rioxarray
da = rioxarray.open_rasterio('gs://my-bucket/geospatial/data.tif')
```

## GPU Acceleration

### CuPy for Raster Processing

```python
import cupy as cp
import numpy as np

def gpu_ndvi(nir, red):
    """Calculate NDVI on GPU."""
    # Transfer to GPU
    nir_gpu = cp.asarray(nir)
    red_gpu = cp.asarray(red)

    # Calculate on GPU
    ndvi_gpu = (nir_gpu - red_gpu) / (nir_gpu + red_gpu + 1e-8)

    # Transfer back
    return cp.asnumpy(ndvi_gpu)

# Batch processing
def batch_process_gpu(raster_path):
    with rasterio.open(raster_path) as src:
        data = src.read()  # (bands, height, width)

    data_gpu = cp.asarray(data)

    # Process all bands
    for i in range(data.shape[0]):
        data_gpu[i] = (data_gpu[i] - data_gpu[i].min()) / \
                      (data_gpu[i].max() - data_gpu[i].min())

    return cp.asnumpy(data_gpu)
```

### RAPIDS for Spatial Analysis

```python
import cudf
import cuspatial

# Load data to GPU
gdf_gpu = cuspatial.from_geopandas(gdf)

# Spatial join on GPU
points_gpu = cuspatial.from_geopandas(points_gdf)
polygons_gpu = cuspatial.from_geopandas(polygons_gdf)

joined = cuspatial.join_polygon_points(
    polygons_gpu,
    points_gpu
)

# Convert back
result = joined.to_pandas()
```

### PyTorch for Geospatial Deep Learning

```python
import torch
from torch.utils.data import DataLoader

# Custom dataset
class SatelliteDataset(torch.utils.data.Dataset):
    def __init__(self, image_paths, label_paths):
        self.image_paths = image_paths
        self.label_paths = label_paths

    def __getitem__(self, idx):
        with rasterio.open(self.image_paths[idx]) as src:
            image = src.read().astype(np.float32)

        with rasterio.open(self.label_paths[idx]) as src:
            label = src.read(1).astype(np.int64)

        return torch.from_numpy(image), torch.from_numpy(label)

# DataLoader with GPU prefetching
dataset = SatelliteDataset(images, labels)
loader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    num_workers=4,
    pin_memory=True,  # Faster transfer to GPU
)

# Training with mixed precision
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for images, labels in loader:
    images, labels = images.to('cuda'), labels.to('cuda')

    with autocast():
        outputs = model(images)
        loss = criterion(outputs, labels)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
```

## Efficient Data Formats

### Cloud-Optimized GeoTIFF (COG)

```python
from rio_cogeo.cogeo import cog_translate

# Convert to COG
cog_translate(
    src_path='input.tif',
    dst_path='output_cog.tif',
    dst_kwds={'compress': 'DEFLATE', 'predictor': 2},
    overview_level=5,
    overview_resampling='average',
    config={'GDAL_TIFF_INTERNAL_MASK': True}
)

# Create overviews for faster access
with rasterio.open('output.tif', 'r+') as src:
    src.build_overviews([2, 4, 8, 16], resampling='average')
    src.update_tags(ns='rio_overview', resampling='average')
```

### Zarr for Multidimensional Arrays

```python
import xarray as xr
import zarr

# Create Zarr store
store = zarr.DirectoryStore('data.zarr')

# Save datacube to Zarr
ds.to_zarr(store, consolidated=True)

# Read efficiently
ds = xr.open_zarr('data.zarr', consolidated=True)

# Extract subset efficiently
subset = ds.sel(time='2023-01', latitude=slice(30, 40))
```

### Parquet for Vector Data

```python
import geopandas as gpd

# Write to Parquet (with spatial index)
gdf.to_parquet('data.parquet', compression='snappy', index=True)

# Read efficiently
gdf = gpd.read_parquet('data.parquet')

# Read subset with filtering
import pyarrow.parquet as pq
table = pq.read_table('data.parquet', filters=[('column', '==', 'value')])
```

For more big data examples, see [code-examples.md](code-examples.md).