Add GeoMaster: Comprehensive Geospatial Science Skill

- Added SKILL.md with installation, quick start, core concepts, workflows
- Added 12 reference documentation files covering 70+ topics
- Includes 500+ code examples across 7 programming languages
- Covers remote sensing, GIS, ML/AI, 30+ scientific domains
- MIT License

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Co-Authored-By: Dr. Umair Rabbani <umairrs@gmail.com>
2026-03-27 07:09:27 +08:00 · 2026-03-01 13:42:41 +05:00
parent 29c869326e
commit 4787f98d98
14 changed files with 5526 additions and 0 deletions
--- a/scientific-skills/geomaster/references/big-data.md
+++ b/scientific-skills/geomaster/references/big-data.md
@@ -0,0 +1,363 @@
+# Big Data and Cloud Computing
+
+Distributed processing, cloud platforms, and GPU acceleration for geospatial data.
+
+## Distributed Processing with Dask
+
+### Dask-GeoPandas
+
+```python
+import dask_geopandas
+import geopandas as gpd
+import dask.dataframe as dd
+
+# Read large GeoPackage in chunks (the layer is split into 10 partitions)
+dask_gdf = dask_geopandas.read_file('large.gpkg', npartitions=10)
+
+# Perform spatial operations (these only build a task graph; nothing runs yet)
+# NOTE(review): area and the buffer distance of 1000 are in CRS units --
+# presumably a projected CRS in metres; confirm before relying on the values.
+dask_gdf['area'] = dask_gdf.geometry.area
+dask_gdf['buffer'] = dask_gdf.geometry.buffer(1000)
+
+# Compute result (executes the graph and returns an in-memory result)
+result = dask_gdf.compute()
+
+# Distributed spatial join
+dask_points = dask_geopandas.read_file('points.gpkg', npartitions=5)
+dask_zones = dask_geopandas.read_file('zones.gpkg', npartitions=3)
+
+# Inner join keeps only the points that fall within a zone
+joined = dask_points.sjoin(dask_zones, how='inner', predicate='within')
+result = joined.compute()
+```
+
+### Dask for Raster Processing
+
+```python
+import dask.array as da
+import rasterio
+
+# Create lazy-loaded raster array
+def lazy_raster(path, chunks=(1, 1024, 1024)):
+    with rasterio.open(path) as src:
+        profile = src.profile
+        # Create dask array
+        raster = da.from_rasterio(src, chunks=chunks)
+
+    return raster, profile
+
+# Process large raster
+raster, profile = lazy_raster('very_large.tif')
+
+# Calculate NDVI (lazy operation)
+ndvi = (raster[3] - raster[2]) / (raster[3] + raster[2] + 1e-8)
+
+# Apply function to each chunk
+def process_chunk(chunk):
+    return (chunk - chunk.min()) / (chunk.max() - chunk.min())
+
+normalized = da.map_blocks(process_chunk, ndvi, dtype=np.float32)
+
+# Compute and save
+with rasterio.open('output.tif', 'w', **profile) as dst:
+    dst.write(normalized.compute())
+```
+
+### Dask Distributed Cluster
+
+```python
+from dask.distributed import Client
+
+# Connect to cluster
+client = Client('scheduler-address:8786')
+
+# Or create local cluster
+from dask.distributed import LocalCluster
+cluster = LocalCluster(n_workers=4, threads_per_worker=2, memory_limit='4GB')
+client = Client(cluster)
+
+# Use Dask-GeoPandas with cluster
+dask_gdf = dask_geopandas.from_geopandas(gdf, npartitions=10)
+dask_gdf = dask_gdf.set_index(calculate_spatial_partitions=True)
+
+# Operations are now distributed
+result = dask_gdf.buffer(1000).compute()
+```
+
+## Cloud Platforms
+
+### Google Earth Engine
+
+```python
+import ee
+
+# Initialize
+ee.Initialize(project='your-project')
+
+# Large-scale composite
+def create_annual_composite(year):
+    """Create cloud-free annual composite."""
+
+    # Sentinel-2 collection
+    s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED') \
+        .filterBounds(ee.Geometry.Rectangle([-125, 32, -114, 42])) \
+        .filterDate(f'{year}-01-01', f'{year}-12-31') \
+        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20))
+
+    # Cloud masking
+    def mask_s2(image):
+        qa = image.select('QA60')
+        cloud_bit_mask = 1 << 10
+        cirrus_bit_mask = 1 << 11
+        mask = qa.bitwiseAnd(cloud_bit_mask).eq(0).And(
+               qa.bitwiseAnd(cirrus_bit_mask).eq(0))
+        return image.updateMask(mask.Not())
+
+    s2_masked = s2.map(mask_s2)
+
+    # Median composite
+    composite = s2_masked.median().clip(roi)
+
+    return composite
+
+# Export to Google Drive
+task = ee.batch.Export.image.toDrive(
+    image=composite,
+    description='CA_composite_2023',
+    scale=10,
+    region=roi,
+    crs='EPSG:32611',
+    maxPixels=1e13
+)
+task.start()
+```
+
+### Planetary Computer (Microsoft)
+
+```python
+import pystac_client
+import planetary_computer
+import odc.stac
+import xarray as xr
+
+# Search catalog
+catalog = pystac_client.Client.open(
+    "https://planetarycomputer.microsoft.com/api/stac/v1",
+    modifier=planetary_computer.sign_inplace,
+)
+
+# Search NAIP imagery
+search = catalog.search(
+    collections=["naip"],
+    bbox=[-125, 32, -114, 42],
+    datetime="2020-01-01/2023-12-31",
+)
+
+items = list(search.get_items())
+
+# Load as xarray dataset
+data = odc.stac.load(
+    items[:100],  # Process in batches
+    bands=["image"],
+    crs="EPSG:32611",
+    resolution=1.0,
+    chunkx=1024,
+    chunky=1024,
+)
+
+# Compute statistics lazily
+mean = data.mean().compute()
+std = data.std().compute()
+
+# Export to COG
+import rioxarray
+data.isel(time=0).rio.to_raster('naip_composite.tif', compress='DEFLATE')
+```
+
+### Google Cloud Storage
+
+```python
+from google.cloud import storage
+import rasterio
+from rasterio.session import GSSession
+
+# Upload to GCS
+client = storage.Client()
+bucket = client.bucket('my-bucket')
+blob = bucket.blob('geospatial/data.tif')
+blob.upload_from_filename('local_data.tif')
+
+# Read directly from GCS
+with rasterio.open(
+    'gs://my-bucket/geospatial/data.tif',
+    session=GSSession()
+) as src:
+    data = src.read()
+
+# Use with Rioxarray
+import rioxarray
+da = rioxarray.open_rasterio('gs://my-bucket/geospatial/data.tif')
+```
+
+## GPU Acceleration
+
+### CuPy for Raster Processing
+
+```python
+import cupy as cp
+import numpy as np
+
+def gpu_ndvi(nir, red):
+    """Calculate NDVI on GPU."""
+    # Transfer to GPU
+    nir_gpu = cp.asarray(nir)
+    red_gpu = cp.asarray(red)
+
+    # Calculate on GPU
+    ndvi_gpu = (nir_gpu - red_gpu) / (nir_gpu + red_gpu + 1e-8)
+
+    # Transfer back
+    return cp.asnumpy(ndvi_gpu)
+
+# Batch processing
+def batch_process_gpu(raster_path):
+    with rasterio.open(raster_path) as src:
+        data = src.read()  # (bands, height, width)
+
+    data_gpu = cp.asarray(data)
+
+    # Process all bands
+    for i in range(data.shape[0]):
+        data_gpu[i] = (data_gpu[i] - data_gpu[i].min()) / \
+                      (data_gpu[i].max() - data_gpu[i].min())
+
+    return cp.asnumpy(data_gpu)
+```
+
+### RAPIDS for Spatial Analysis
+
+```python
+import cudf
+import cuspatial
+
+# Load data to GPU
+gdf_gpu = cuspatial.from_geopandas(gdf)
+
+# Spatial join on GPU
+points_gpu = cuspatial.from_geopandas(points_gdf)
+polygons_gpu = cuspatial.from_geopandas(polygons_gdf)
+
+joined = cuspatial.join_polygon_points(
+    polygons_gpu,
+    points_gpu
+)
+
+# Convert back
+result = joined.to_pandas()
+```
+
+### PyTorch for Geospatial Deep Learning
+
+```python
+import torch
+from torch.utils.data import DataLoader
+
+# Custom dataset
+class SatelliteDataset(torch.utils.data.Dataset):
+    def __init__(self, image_paths, label_paths):
+        self.image_paths = image_paths
+        self.label_paths = label_paths
+
+    def __getitem__(self, idx):
+        with rasterio.open(self.image_paths[idx]) as src:
+            image = src.read().astype(np.float32)
+
+        with rasterio.open(self.label_paths[idx]) as src:
+            label = src.read(1).astype(np.int64)
+
+        return torch.from_numpy(image), torch.from_numpy(label)
+
+# DataLoader with GPU prefetching
+dataset = SatelliteDataset(images, labels)
+loader = DataLoader(
+    dataset,
+    batch_size=16,
+    shuffle=True,
+    num_workers=4,
+    pin_memory=True,  # Faster transfer to GPU
+)
+
+# Training with mixed precision
+from torch.cuda.amp import autocast, GradScaler
+
+scaler = GradScaler()
+
+for images, labels in loader:
+    images, labels = images.to('cuda'), labels.to('cuda')
+
+    with autocast():
+        outputs = model(images)
+        loss = criterion(outputs, labels)
+
+    scaler.scale(loss).backward()
+    scaler.step(optimizer)
+    scaler.update()
+```
+
+## Efficient Data Formats
+
+### Cloud-Optimized GeoTIFF (COG)
+
+```python
+from rio_cogeo.cogeo import cog_translate
+
+# Convert to COG
+cog_translate(
+    src_path='input.tif',
+    dst_path='output_cog.tif',
+    dst_kwds={'compress': 'DEFLATE', 'predictor': 2},
+    overview_level=5,
+    overview_resampling='average',
+    config={'GDAL_TIFF_INTERNAL_MASK': True}
+)
+
+# Create overviews for faster access
+with rasterio.open('output.tif', 'r+') as src:
+    src.build_overviews([2, 4, 8, 16], resampling='average')
+    src.update_tags(ns='rio_overview', resampling='average')
+```
+
+### Zarr for Multidimensional Arrays
+
+```python
+import xarray as xr
+import zarr
+
+# Create Zarr store
+store = zarr.DirectoryStore('data.zarr')
+
+# Save datacube to Zarr
+ds.to_zarr(store, consolidated=True)
+
+# Read efficiently
+ds = xr.open_zarr('data.zarr', consolidated=True)
+
+# Extract subset efficiently
+subset = ds.sel(time='2023-01', latitude=slice(30, 40))
+```
+
+### Parquet for Vector Data
+
+```python
+import geopandas as gpd
+
+# Write to Parquet (index=True stores the DataFrame index; it does not
+# create a spatial index)
+gdf.to_parquet('data.parquet', compression='snappy', index=True)
+
+# Read efficiently
+gdf = gpd.read_parquet('data.parquet')
+
+# Read subset with filtering (attribute filter applied while reading;
+# returns a pyarrow Table, not a GeoDataFrame)
+import pyarrow.parquet as pq
+table = pq.read_table('data.parquet', filters=[('column', '==', 'value')])
+```
+
+For more big data examples, see [code-examples.md](code-examples.md).