diff --git a/README.md b/README.md deleted file mode 100644 index 9e90fd7..0000000 --- a/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# generate-image - -generate-image - Scientific Skill \ No newline at end of file diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..12dff2d --- /dev/null +++ b/SKILL.md @@ -0,0 +1,178 @@ +--- +name: generate-image +description: Generate or edit images using AI models (FLUX, Gemini). Use for general-purpose image generation including photos, illustrations, artwork, visual assets, concept art, and any image that isn't a technical diagram or schematic. For flowcharts, circuits, pathways, and technical diagrams, use the scientific-schematics skill instead. +--- + +# Generate Image + +Generate and edit high-quality images using OpenRouter's image generation models including FLUX.2 Pro and Gemini 3 Pro. + +## When to Use This Skill + +**Use generate-image for:** +- Photos and photorealistic images +- Artistic illustrations and artwork +- Concept art and visual concepts +- Visual assets for presentations or documents +- Image editing and modifications +- Any general-purpose image generation needs + +**Use scientific-schematics instead for:** +- Flowcharts and process diagrams +- Circuit diagrams and electrical schematics +- Biological pathways and signaling cascades +- System architecture diagrams +- CONSORT diagrams and methodology flowcharts +- Any technical/schematic diagrams + +## Quick Start + +Use the `scripts/generate_image.py` script to generate or edit images: + +```bash +# Generate a new image +python scripts/generate_image.py "A beautiful sunset over mountains" + +# Edit an existing image +python scripts/generate_image.py "Make the sky purple" --input photo.jpg +``` + +This generates/edits an image and saves it as `generated_image.png` in the current directory. + +## API Key Setup + +**CRITICAL**: The script requires an OpenRouter API key. Before running, check if the user has configured their API key: + +1. 
Look for a `.env` file in the project directory or parent directories +2. Check for `OPENROUTER_API_KEY=` in the `.env` file +3. If not found, inform the user they need to: + - Create a `.env` file with `OPENROUTER_API_KEY=your-api-key-here` + - Or set the environment variable: `export OPENROUTER_API_KEY=your-api-key-here` + - Get an API key from: https://openrouter.ai/keys + +The script will automatically detect the `.env` file and provide clear error messages if the API key is missing. + +## Model Selection + +**Default model**: `google/gemini-3-pro-image-preview` (high quality, recommended) + +**Available models for generation and editing**: +- `google/gemini-3-pro-image-preview` - High quality, supports generation + editing +- `black-forest-labs/flux.2-pro` - Fast, high quality, supports generation + editing + +**Generation only**: +- `black-forest-labs/flux.2-flex` - Fast and cheap, but not as high quality as pro + +Select based on: +- **Quality**: Use gemini-3-pro or flux.2-pro +- **Editing**: Use gemini-3-pro or flux.2-pro (both support image editing) +- **Cost**: Use flux.2-flex for generation only + +## Common Usage Patterns + +### Basic generation +```bash +python scripts/generate_image.py "Your prompt here" +``` + +### Specify model +```bash +python scripts/generate_image.py "A cat in space" --model "black-forest-labs/flux.2-pro" +``` + +### Custom output path +```bash +python scripts/generate_image.py "Abstract art" --output artwork.png +``` + +### Edit an existing image +```bash +python scripts/generate_image.py "Make the background blue" --input photo.jpg +``` + +### Edit with a specific model +```bash +python scripts/generate_image.py "Add sunglasses to the person" --input portrait.png --model "black-forest-labs/flux.2-pro" +``` + +### Edit with custom output +```bash +python scripts/generate_image.py "Remove the text from the image" --input screenshot.png --output cleaned.png +``` + +### Multiple images +Run the script multiple times with different 
prompts or output paths: +```bash +python scripts/generate_image.py "Image 1 description" --output image1.png +python scripts/generate_image.py "Image 2 description" --output image2.png +``` + +## Script Parameters + +- `prompt` (required): Text description of the image to generate, or editing instructions +- `--input` or `-i`: Input image path for editing (enables edit mode) +- `--model` or `-m`: OpenRouter model ID (default: google/gemini-3-pro-image-preview) +- `--output` or `-o`: Output file path (default: generated_image.png) +- `--api-key`: OpenRouter API key (overrides .env file) + +## Example Use Cases + +### For Scientific Documents +```bash +# Generate a conceptual illustration for a paper +python scripts/generate_image.py "Microscopic view of cancer cells being attacked by immunotherapy agents, scientific illustration style" --output figures/immunotherapy_concept.png + +# Create a visual for a presentation +python scripts/generate_image.py "DNA double helix structure with highlighted mutation site, modern scientific visualization" --output slides/dna_mutation.png +``` + +### For Presentations and Posters +```bash +# Title slide background +python scripts/generate_image.py "Abstract blue and white background with subtle molecular patterns, professional presentation style" --output slides/background.png + +# Poster hero image +python scripts/generate_image.py "Laboratory setting with modern equipment, photorealistic, well-lit" --output poster/hero.png +``` + +### For General Visual Content +```bash +# Website or documentation images +python scripts/generate_image.py "Professional team collaboration around a digital whiteboard, modern office" --output docs/team_collaboration.png + +# Marketing materials +python scripts/generate_image.py "Futuristic AI brain concept with glowing neural networks" --output marketing/ai_concept.png +``` + +## Error Handling + +The script provides clear error messages for: +- Missing API key (with setup instructions) +- API 
errors (with status codes) +- Unexpected response formats +- Missing dependencies (requests library) + +If the script fails, read the error message and address the issue before retrying. + +## Notes + +- Images are returned as base64-encoded data URLs and automatically saved as PNG files +- The script supports both `images` and `content` response formats from different OpenRouter models +- Generation time varies by model (typically 5-30 seconds) +- For image editing, the input image is encoded as base64 and sent to the model +- Supported input image formats: PNG, JPEG, GIF, WebP +- Check OpenRouter pricing for cost information: https://openrouter.ai/models + +## Image Editing Tips + +- Be specific about what changes you want (e.g., "change the sky to sunset colors" vs "edit the sky") +- Reference specific elements in the image when possible +- For best results, use clear and detailed editing instructions +- Both Gemini 3 Pro and FLUX.2 Pro support image editing through OpenRouter + +## Integration with Other Skills + +- **scientific-schematics**: Use for technical diagrams, flowcharts, circuits, pathways +- **generate-image**: Use for photos, illustrations, artwork, visual concepts +- **scientific-slides**: Combine with generate-image for visually rich presentations +- **latex-posters**: Use generate-image for poster visuals and hero images diff --git a/scripts/generate_image.py b/scripts/generate_image.py new file mode 100755 index 0000000..1dd71d1 --- /dev/null +++ b/scripts/generate_image.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +Generate and edit images using OpenRouter API with various image generation models. + +Supports models like: +- google/gemini-3-pro-image-preview (generation and editing) +- black-forest-labs/flux.2-pro (generation and editing) +- black-forest-labs/flux.2-flex (generation) +- And more image generation models available on OpenRouter + +For image editing, provide an input image along with an editing prompt. 
import os
import sys
import json
import base64
import argparse
from pathlib import Path
from typing import Optional

# OpenRouter chat-completions endpoint used for both generation and editing.
API_URL = "https://openrouter.ai/api/v1/chat/completions"
# Model used when the caller does not pass one.
DEFAULT_MODEL = "google/gemini-3-pro-image-preview"
# Name shared by the environment variable and the .env entry.
API_KEY_ENV_VAR = "OPENROUTER_API_KEY"
# (connect, read) timeouts in seconds; without them a stalled connection
# would block the script forever.
REQUEST_TIMEOUT_SECONDS = (10, 300)
# Content-part "type" values treated as carrying an image.
_IMAGE_PART_TYPES = ("image", "image_url")


def check_env_file() -> Optional[str]:
    """Find OPENROUTER_API_KEY in a .env file.

    Searches the current working directory first, then each parent
    directory in turn. Within a file, the first non-empty
    ``OPENROUTER_API_KEY=...`` entry wins; an optional leading ``export``
    and surrounding single/double quotes are stripped.

    Returns:
        The API key, or None when no .env file provides one.
    """
    prefix = f"{API_KEY_ENV_VAR}="
    current_dir = Path.cwd()
    for directory in (current_dir, *current_dir.parents):
        env_file = directory / ".env"
        # is_file() rather than exists(): a directory named ".env" must not
        # be opened.
        if not env_file.is_file():
            continue
        try:
            with open(env_file, "r", encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if line.startswith("export "):
                        line = line[len("export "):].lstrip()
                    if not line.startswith(prefix):
                        continue
                    api_key = line[len(prefix):].strip().strip('"').strip("'")
                    if api_key:
                        return api_key
        except (OSError, UnicodeDecodeError):
            # An unreadable .env should not abort the search; a parent
            # directory may still provide the key.
            continue
    return None


def load_image_as_base64(image_path: str) -> str:
    """Load an image file and return it as a base64 data URL.

    The MIME type is chosen from the file extension (.png, .jpg, .jpeg,
    .gif, .webp); any other extension is labelled image/png.

    Args:
        image_path: Path to the image file to encode.

    Returns:
        A ``data:<mime>;base64,<payload>`` string.

    Exits the process with status 1 when the path is not an existing file.
    """
    path = Path(image_path)
    if not path.is_file():
        print(f"❌ Error: Image file not found: {image_path}")
        sys.exit(1)

    mime_types = {
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
    }
    mime_type = mime_types.get(path.suffix.lower(), 'image/png')

    base64_data = base64.b64encode(path.read_bytes()).decode('utf-8')
    return f"data:{mime_type};base64,{base64_data}"


def save_base64_image(base64_data: str, output_path: str) -> None:
    """Decode base64 image data and write it to ``output_path``.

    Args:
        base64_data: Raw base64 text, or a data URL whose payload follows
            the first comma.
        output_path: Destination file. Missing parent directories are
            created so that paths such as ``figures/plot.png`` work.
    """
    # Remove data URL prefix if present
    if ',' in base64_data:
        base64_data = base64_data.split(',', 1)[1]

    image_data = base64.b64decode(base64_data)

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_bytes(image_data)


def _collect_images(message: dict) -> list:
    """Return the image entries of a response message.

    Uses the message's ``images`` list when it is non-empty; otherwise
    picks the image-typed dict parts out of a list-valued ``content``.
    """
    images = message.get("images")
    if images:
        return list(images)

    content = message.get("content")
    if isinstance(content, list):
        return [
            part for part in content
            if isinstance(part, dict) and part.get("type") in _IMAGE_PART_TYPES
        ]
    return []


def _image_url(image) -> Optional[str]:
    """Return the URL stored in one image entry, or None if there is none.

    Accepts ``{"image_url": {"url": ...}}``, ``{"image_url": "..."}`` and
    ``{"url": ...}`` shapes.
    """
    if not isinstance(image, dict):
        return None
    nested = image.get("image_url")
    if isinstance(nested, dict) and nested.get("url"):
        return nested["url"]
    if isinstance(nested, str) and nested:
        return nested
    return image.get("url") or None


def _first_image_url(result) -> Optional[str]:
    """Return the URL of the first image in an API response, or None."""
    if not isinstance(result, dict):
        return None
    choices = result.get("choices")
    if not choices:
        return None
    message = choices[0].get("message") or {}
    images = _collect_images(message)
    return _image_url(images[0]) if images else None


def generate_image(
    prompt: str,
    model: str = DEFAULT_MODEL,
    output_path: str = "generated_image.png",
    api_key: Optional[str] = None,
    input_image: Optional[str] = None
) -> dict:
    """
    Generate or edit an image using OpenRouter API.

    The first image found in the response is written to ``output_path``.
    When the response contains no usable image a warning is printed and the
    response is still returned.

    Args:
        prompt: Text description of the image to generate, or editing instructions
        model: OpenRouter model ID (default: google/gemini-3-pro-image-preview)
        output_path: Path to save the generated image
        api_key: OpenRouter API key. When omitted, the OPENROUTER_API_KEY
            environment variable is used, then a .env file.
        input_image: Path to an input image for editing (optional)

    Returns:
        dict: Response from OpenRouter API

    Exits the process with status 1 when ``requests`` is not installed, no
    API key can be found, the input image is missing, the request fails, or
    the API answers with a non-200 status or a non-JSON body.
    """
    try:
        import requests
    except ImportError:
        print("Error: 'requests' library not found. Install with: pip install requests")
        sys.exit(1)

    # Precedence: explicit argument, then environment variable, then .env.
    # The error text below tells users to export the variable, so it has to
    # be honoured here.
    if not api_key:
        api_key = os.environ.get(API_KEY_ENV_VAR, "").strip() or check_env_file()

    if not api_key:
        print("❌ Error: OPENROUTER_API_KEY not found!")
        print("\nPlease create a .env file in your project directory with:")
        print("OPENROUTER_API_KEY=your-api-key-here")
        print("\nOr set the environment variable:")
        print("export OPENROUTER_API_KEY=your-api-key-here")
        print("\nGet your API key from: https://openrouter.ai/keys")
        sys.exit(1)

    if input_image is not None:
        print(f"✏️ Editing image with model: {model}")
        print(f"📷 Input image: {input_image}")
        print(f"📝 Edit prompt: {prompt}")

        # Editing sends a multimodal message: the instructions plus the
        # input image embedded as a data URL.
        message_content = [
            {"type": "text", "text": prompt},
            {
                "type": "image_url",
                "image_url": {"url": load_image_as_base64(input_image)},
            },
        ]
    else:
        print(f"🎨 Generating image with model: {model}")
        print(f"📝 Prompt: {prompt}")
        message_content = prompt

    try:
        response = requests.post(
            url=API_URL,
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": model,
                "messages": [{"role": "user", "content": message_content}],
                "modalities": ["image", "text"],
            },
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as exc:
        print(f"❌ Request failed: {exc}")
        sys.exit(1)

    if response.status_code != 200:
        print(f"❌ API Error ({response.status_code}): {response.text}")
        sys.exit(1)

    try:
        result = response.json()
    except ValueError:
        print(f"❌ API returned a non-JSON response: {response.text}")
        sys.exit(1)

    choices = result.get("choices") if isinstance(result, dict) else None
    if not choices:
        print("❌ No choices in response")
        print(f"Response: {json.dumps(result, indent=2)}")
        return result

    message = choices[0].get("message") or {}
    images = _collect_images(message)
    if not images:
        print("⚠️ No image found in response")
        if message.get("content"):
            print(f"Response content: {message['content']}")
        return result

    # Only the first image is saved.
    image = images[0]
    image_url = _image_url(image)
    if image_url is None:
        print(f"⚠️ Unexpected image format: {image}")
        return result

    save_base64_image(image_url, output_path)
    print(f"✅ Image saved to: {output_path}")
    return result


def main():
    """Parse command-line arguments and run one generation or edit.

    Exits with status 1 when the API response contained no image, so
    callers and shell scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(
        description="Generate or edit images using OpenRouter API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate with default model (Gemini 3 Pro Image Preview)
  python generate_image.py "A beautiful sunset over mountains"

  # Use a specific model
  python generate_image.py "A cat in space" --model "black-forest-labs/flux.2-pro"

  # Specify output path
  python generate_image.py "Abstract art" --output my_image.png

  # Edit an existing image
  python generate_image.py "Make the sky purple" --input photo.jpg --output edited.png

  # Edit with a specific model
  python generate_image.py "Add a hat to the person" --input portrait.png -m "black-forest-labs/flux.2-pro"

Popular image models:
  - google/gemini-3-pro-image-preview (default, high quality, generation + editing)
  - black-forest-labs/flux.2-pro (fast, high quality, generation + editing)
  - black-forest-labs/flux.2-flex (fast and cheap, generation only)
        """
    )

    parser.add_argument(
        "prompt",
        type=str,
        help="Text description of the image to generate, or editing instructions"
    )

    parser.add_argument(
        "--model", "-m",
        type=str,
        default=DEFAULT_MODEL,
        help="OpenRouter model ID (default: google/gemini-3-pro-image-preview)"
    )

    parser.add_argument(
        "--output", "-o",
        type=str,
        default="generated_image.png",
        help="Output file path (default: generated_image.png)"
    )

    parser.add_argument(
        "--input", "-i",
        type=str,
        help="Input image path for editing (enables edit mode)"
    )

    parser.add_argument(
        "--api-key",
        type=str,
        help="OpenRouter API key (will check .env if not provided)"
    )

    args = parser.parse_args()

    result = generate_image(
        prompt=args.prompt,
        model=args.model,
        output_path=args.output,
        api_key=args.api_key,
        input_image=args.input
    )

    # generate_image only warns when the response carries no image; turn
    # that into a failing exit status for the command line.
    if _first_image_url(result) is None:
        sys.exit(1)


if __name__ == "__main__":
    main()