mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-03-27 07:09:27 +08:00
- Added SKILL.md with installation, quick start, core concepts, workflows - Added 12 reference documentation files covering 70+ topics - Includes 500+ code examples across 7 programming languages - Covers remote sensing, GIS, ML/AI, 30+ scientific domains - MIT License Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Co-Authored-By: Dr. Umair Rabbani <umairrs@gmail.com>
364 lines
8.6 KiB
Markdown
364 lines
8.6 KiB
Markdown
# Big Data and Cloud Computing
|
|
|
|
Distributed processing, cloud platforms, and GPU acceleration for geospatial data.
|
|
|
|
## Distributed Processing with Dask
|
|
|
|
### Dask-GeoPandas
|
|
|
|
```python
|
|
import dask_geopandas
|
|
import geopandas as gpd
|
|
import dask.dataframe as dd
|
|
|
|
# Read large GeoPackage in chunks
|
|
dask_gdf = dask_geopandas.read_file('large.gpkg', npartitions=10)
|
|
|
|
# Perform spatial operations
|
|
dask_gdf['area'] = dask_gdf.geometry.area
|
|
dask_gdf['buffer'] = dask_gdf.geometry.buffer(1000)
|
|
|
|
# Compute result
|
|
result = dask_gdf.compute()
|
|
|
|
# Distributed spatial join
|
|
dask_points = dask_geopandas.read_file('points.gpkg', npartitions=5)
|
|
dask_zones = dask_geopandas.read_file('zones.gpkg', npartitions=3)
|
|
|
|
joined = dask_points.sjoin(dask_zones, how='inner', predicate='within')
|
|
result = joined.compute()
|
|
```
|
|
|
|
### Dask for Raster Processing
|
|
|
|
```python
|
|
import dask.array as da
|
|
import rasterio
|
|
|
|
# Create lazy-loaded raster array
|
|
def lazy_raster(path, chunks=(1, 1024, 1024)):
|
|
with rasterio.open(path) as src:
|
|
profile = src.profile
|
|
# Create dask array
|
|
raster = da.from_rasterio(src, chunks=chunks)
|
|
|
|
return raster, profile
|
|
|
|
# Process large raster
|
|
raster, profile = lazy_raster('very_large.tif')
|
|
|
|
# Calculate NDVI (lazy operation)
|
|
ndvi = (raster[3] - raster[2]) / (raster[3] + raster[2] + 1e-8)
|
|
|
|
# Apply function to each chunk
|
|
def process_chunk(chunk):
|
|
return (chunk - chunk.min()) / (chunk.max() - chunk.min())
|
|
|
|
normalized = da.map_blocks(process_chunk, ndvi, dtype=np.float32)
|
|
|
|
# Compute and save
|
|
with rasterio.open('output.tif', 'w', **profile) as dst:
|
|
dst.write(normalized.compute())
|
|
```
|
|
|
|
### Dask Distributed Cluster
|
|
|
|
```python
|
|
from dask.distributed import Client
|
|
|
|
# Connect to cluster
|
|
client = Client('scheduler-address:8786')
|
|
|
|
# Or create local cluster
|
|
from dask.distributed import LocalCluster
|
|
cluster = LocalCluster(n_workers=4, threads_per_worker=2, memory_limit='4GB')
|
|
client = Client(cluster)
|
|
|
|
# Use Dask-GeoPandas with cluster
|
|
dask_gdf = dask_geopandas.from_geopandas(gdf, npartitions=10)
|
|
dask_gdf = dask_gdf.set_index(calculate_spatial_partitions=True)
|
|
|
|
# Operations are now distributed
|
|
result = dask_gdf.buffer(1000).compute()
|
|
```
|
|
|
|
## Cloud Platforms
|
|
|
|
### Google Earth Engine
|
|
|
|
```python
|
|
import ee
|
|
|
|
# Initialize
|
|
ee.Initialize(project='your-project')
|
|
|
|
# Large-scale composite
|
|
def create_annual_composite(year):
|
|
"""Create cloud-free annual composite."""
|
|
|
|
# Sentinel-2 collection
|
|
s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED') \
|
|
.filterBounds(ee.Geometry.Rectangle([-125, 32, -114, 42])) \
|
|
.filterDate(f'{year}-01-01', f'{year}-12-31') \
|
|
.filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20))
|
|
|
|
# Cloud masking
|
|
def mask_s2(image):
|
|
qa = image.select('QA60')
|
|
cloud_bit_mask = 1 << 10
|
|
cirrus_bit_mask = 1 << 11
|
|
mask = qa.bitwiseAnd(cloud_bit_mask).eq(0).And(
|
|
qa.bitwiseAnd(cirrus_bit_mask).eq(0))
|
|
return image.updateMask(mask.Not())
|
|
|
|
s2_masked = s2.map(mask_s2)
|
|
|
|
# Median composite
|
|
composite = s2_masked.median().clip(roi)
|
|
|
|
return composite
|
|
|
|
# Export to Google Drive
|
|
task = ee.batch.Export.image.toDrive(
|
|
image=composite,
|
|
description='CA_composite_2023',
|
|
scale=10,
|
|
region=roi,
|
|
crs='EPSG:32611',
|
|
maxPixels=1e13
|
|
)
|
|
task.start()
|
|
```
|
|
|
|
### Planetary Computer (Microsoft)
|
|
|
|
```python
|
|
import pystac_client
|
|
import planetary_computer
|
|
import odc.stac
|
|
import xarray as xr
|
|
|
|
# Search catalog
|
|
catalog = pystac_client.Client.open(
|
|
"https://planetarycomputer.microsoft.com/api/stac/v1",
|
|
modifier=planetary_computer.sign_inplace,
|
|
)
|
|
|
|
# Search NAIP imagery
|
|
search = catalog.search(
|
|
collections=["naip"],
|
|
bbox=[-125, 32, -114, 42],
|
|
datetime="2020-01-01/2023-12-31",
|
|
)
|
|
|
|
items = list(search.get_items())
|
|
|
|
# Load as xarray dataset
|
|
data = odc.stac.load(
|
|
items[:100], # Process in batches
|
|
bands=["image"],
|
|
crs="EPSG:32611",
|
|
resolution=1.0,
|
|
chunkx=1024,
|
|
chunky=1024,
|
|
)
|
|
|
|
# Compute statistics lazily
|
|
mean = data.mean().compute()
|
|
std = data.std().compute()
|
|
|
|
# Export to COG
|
|
import rioxarray
|
|
data.isel(time=0).rio.to_raster('naip_composite.tif', compress='DEFLATE')
|
|
```
|
|
|
|
### Google Cloud Storage
|
|
|
|
```python
|
|
from google.cloud import storage
|
|
import rasterio
|
|
from rasterio.session import GSSession
|
|
|
|
# Upload to GCS
|
|
client = storage.Client()
|
|
bucket = client.bucket('my-bucket')
|
|
blob = bucket.blob('geospatial/data.tif')
|
|
blob.upload_from_filename('local_data.tif')
|
|
|
|
# Read directly from GCS
|
|
with rasterio.open(
|
|
'gs://my-bucket/geospatial/data.tif',
|
|
session=GSSession()
|
|
) as src:
|
|
data = src.read()
|
|
|
|
# Use with Rioxarray
|
|
import rioxarray
|
|
da = rioxarray.open_rasterio('gs://my-bucket/geospatial/data.tif')
|
|
```
|
|
|
|
## GPU Acceleration
|
|
|
|
### CuPy for Raster Processing
|
|
|
|
```python
|
|
import cupy as cp
|
|
import numpy as np
|
|
|
|
def gpu_ndvi(nir, red):
|
|
"""Calculate NDVI on GPU."""
|
|
# Transfer to GPU
|
|
nir_gpu = cp.asarray(nir)
|
|
red_gpu = cp.asarray(red)
|
|
|
|
# Calculate on GPU
|
|
ndvi_gpu = (nir_gpu - red_gpu) / (nir_gpu + red_gpu + 1e-8)
|
|
|
|
# Transfer back
|
|
return cp.asnumpy(ndvi_gpu)
|
|
|
|
# Batch processing
|
|
def batch_process_gpu(raster_path):
|
|
with rasterio.open(raster_path) as src:
|
|
data = src.read() # (bands, height, width)
|
|
|
|
data_gpu = cp.asarray(data)
|
|
|
|
# Process all bands
|
|
for i in range(data.shape[0]):
|
|
data_gpu[i] = (data_gpu[i] - data_gpu[i].min()) / \
|
|
(data_gpu[i].max() - data_gpu[i].min())
|
|
|
|
return cp.asnumpy(data_gpu)
|
|
```
|
|
|
|
### RAPIDS for Spatial Analysis
|
|
|
|
```python
|
|
import cudf
|
|
import cuspatial
|
|
|
|
# Load data to GPU
|
|
gdf_gpu = cuspatial.from_geopandas(gdf)
|
|
|
|
# Spatial join on GPU
|
|
points_gpu = cuspatial.from_geopandas(points_gdf)
|
|
polygons_gpu = cuspatial.from_geopandas(polygons_gdf)
|
|
|
|
joined = cuspatial.join_polygon_points(
|
|
polygons_gpu,
|
|
points_gpu
|
|
)
|
|
|
|
# Convert back
|
|
result = joined.to_pandas()
|
|
```
|
|
|
|
### PyTorch for Geospatial Deep Learning
|
|
|
|
```python
|
|
import torch
|
|
from torch.utils.data import DataLoader
|
|
|
|
# Custom dataset
|
|
class SatelliteDataset(torch.utils.data.Dataset):
|
|
def __init__(self, image_paths, label_paths):
|
|
self.image_paths = image_paths
|
|
self.label_paths = label_paths
|
|
|
|
def __getitem__(self, idx):
|
|
with rasterio.open(self.image_paths[idx]) as src:
|
|
image = src.read().astype(np.float32)
|
|
|
|
with rasterio.open(self.label_paths[idx]) as src:
|
|
label = src.read(1).astype(np.int64)
|
|
|
|
return torch.from_numpy(image), torch.from_numpy(label)
|
|
|
|
# DataLoader with GPU prefetching
|
|
dataset = SatelliteDataset(images, labels)
|
|
loader = DataLoader(
|
|
dataset,
|
|
batch_size=16,
|
|
shuffle=True,
|
|
num_workers=4,
|
|
pin_memory=True, # Faster transfer to GPU
|
|
)
|
|
|
|
# Training with mixed precision
|
|
from torch.cuda.amp import autocast, GradScaler
|
|
|
|
scaler = GradScaler()
|
|
|
|
for images, labels in loader:
|
|
images, labels = images.to('cuda'), labels.to('cuda')
|
|
|
|
with autocast():
|
|
outputs = model(images)
|
|
loss = criterion(outputs, labels)
|
|
|
|
scaler.scale(loss).backward()
|
|
scaler.step(optimizer)
|
|
scaler.update()
|
|
```
|
|
|
|
## Efficient Data Formats
|
|
|
|
### Cloud-Optimized GeoTIFF (COG)
|
|
|
|
```python
|
|
from rio_cogeo.cogeo import cog_translate
|
|
|
|
# Convert to COG
|
|
cog_translate(
|
|
src_path='input.tif',
|
|
dst_path='output_cog.tif',
|
|
dst_kwds={'compress': 'DEFLATE', 'predictor': 2},
|
|
overview_level=5,
|
|
overview_resampling='average',
|
|
config={'GDAL_TIFF_INTERNAL_MASK': True}
|
|
)
|
|
|
|
# Create overviews for faster access
|
|
with rasterio.open('output.tif', 'r+') as src:
|
|
src.build_overviews([2, 4, 8, 16], resampling='average')
|
|
src.update_tags(ns='rio_overview', resampling='average')
|
|
```
|
|
|
|
### Zarr for Multidimensional Arrays
|
|
|
|
```python
|
|
import xarray as xr
|
|
import zarr
|
|
|
|
# Create Zarr store
|
|
store = zarr.DirectoryStore('data.zarr')
|
|
|
|
# Save datacube to Zarr
|
|
ds.to_zarr(store, consolidated=True)
|
|
|
|
# Read efficiently
|
|
ds = xr.open_zarr('data.zarr', consolidated=True)
|
|
|
|
# Extract subset efficiently
|
|
subset = ds.sel(time='2023-01', latitude=slice(30, 40))
|
|
```
|
|
|
|
### Parquet for Vector Data
|
|
|
|
```python
|
|
import geopandas as gpd
|
|
|
|
# Write to Parquet (with spatial index)
|
|
gdf.to_parquet('data.parquet', compression='snappy', index=True)
|
|
|
|
# Read efficiently
|
|
gdf = gpd.read_parquet('data.parquet')
|
|
|
|
# Read subset with filtering
|
|
import pyarrow.parquet as pq
|
|
table = pq.read_table('data.parquet', filters=[('column', '==', 'value')])
|
|
```
|
|
|
|
For more big data examples, see [code-examples.md](code-examples.md).
|