Initial commit for scientific-slides

This commit is contained in:
dfty
2026-01-29 22:15:20 +08:00
commit 0ac8c759ea
12 changed files with 6962 additions and 0 deletions

View File

@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Slide image generation using Nano Banana Pro.
Generate presentation slides or visuals by describing them in natural language.
Nano Banana Pro handles everything automatically with smart iterative refinement.
Two modes:
- Default (full slide): Generate complete slides with title, content, visuals (for PDF workflow)
- Visual only: Generate just images/figures to place on slides (for PPT workflow)
Supports attaching reference images for context (Nano Banana Pro will see these).
Usage:
# Generate full slide for PDF workflow
python generate_slide_image.py "Title: Introduction\\nKey points: AI, ML, Deep Learning" -o slide_01.png
# Generate visual only for PPT workflow
python generate_slide_image.py "Neural network diagram" -o figure.png --visual-only
# With reference images attached
python generate_slide_image.py "Create a slide about this data" -o slide.png --attach chart.png
"""
import argparse
import os
import subprocess
import sys
from pathlib import Path
def main():
    """Command-line wrapper that delegates to ``generate_slide_image_ai.py``.

    Parses the CLI arguments, resolves the OpenRouter API key (``--api-key``
    wins over the ``OPENROUTER_API_KEY`` environment variable), and runs the
    sibling AI generation script in a subprocess. The process exits with the
    subprocess's return code, or 1 when the key or the script is missing or
    the subprocess cannot be started.

    The key is handed to the child through its environment instead of its
    command line so the secret does not show up in process listings.
    """
    parser = argparse.ArgumentParser(
        description="Generate presentation slides or visuals using Nano Banana Pro AI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
How it works:
Describe your slide or visual in natural language.
Nano Banana Pro generates it automatically with:
- Smart iteration (only regenerates if quality is below threshold)
- Quality review by Gemini 3 Pro
- Publication-ready output
Modes:
Default (full slide): Generate complete slide with title, content, visuals
Use for PDF workflow where each slide is an image
Visual only: Generate just the image/figure
Use for PPT workflow where you add text separately
Attachments:
Use --attach to provide reference images that Nano Banana Pro will see.
This allows you to say "create a slide about this chart" and attach the chart.
Examples:
# Full slide (default) - for PDF workflow
python generate_slide_image.py "Title: Machine Learning\\nPoints: supervised, unsupervised, reinforcement" -o slide_01.png
# Visual only - for PPT workflow
python generate_slide_image.py "Flowchart showing data pipeline" -o figure.png --visual-only
# With reference images attached
python generate_slide_image.py "Create a slide explaining this chart" -o slide.png --attach chart.png
python generate_slide_image.py "Combine these into a comparison" -o compare.png --attach before.png --attach after.png
# Multiple slides for PDF
python generate_slide_image.py "Title slide: AI Conference 2025" -o slides/01_title.png
python generate_slide_image.py "Title: Introduction\\nOverview of deep learning" -o slides/02_intro.png
Environment Variables:
OPENROUTER_API_KEY Required for AI generation
"""
    )
    parser.add_argument("prompt", help="Description of the slide or visual to generate")
    parser.add_argument("-o", "--output", required=True, help="Output file path")
    parser.add_argument("--attach", action="append", dest="attachments", metavar="IMAGE",
                        help="Attach image file(s) as context (can use multiple times)")
    parser.add_argument("--visual-only", action="store_true",
                        help="Generate just the visual/figure (for PPT workflow)")
    parser.add_argument("--iterations", type=int, default=2,
                        help="Maximum refinement iterations (default: 2, max: 2)")
    parser.add_argument("--api-key", help="OpenRouter API key (or use OPENROUTER_API_KEY env var)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    args = parser.parse_args()

    # Check for API key: the explicit flag wins over the environment.
    api_key = args.api_key or os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        print("Error: OPENROUTER_API_KEY environment variable not set")
        print("\nFor AI generation, you need an OpenRouter API key.")
        print("Get one at: https://openrouter.ai/keys")
        print("\nSet it with:")
        print(" export OPENROUTER_API_KEY='your_api_key'")
        print("\nOr use --api-key flag")
        sys.exit(1)

    # Find AI generation script (expected next to this file).
    script_dir = Path(__file__).parent
    ai_script = script_dir / "generate_slide_image_ai.py"
    if not ai_script.exists():
        print(f"Error: AI generation script not found: {ai_script}")
        sys.exit(1)

    # Build command
    cmd = [sys.executable, str(ai_script), args.prompt, "-o", args.output]
    for att in args.attachments or []:
        cmd.extend(["--attach", att])
    if args.visual_only:
        cmd.append("--visual-only")

    # Keep the value inside the 1..2 range the AI script accepts; 2 is the
    # child's default, so it is only forwarded when it differs.
    iterations = max(1, min(args.iterations, 2))
    if iterations != 2:
        cmd.extend(["--iterations", str(iterations)])
    if args.verbose:
        cmd.append("-v")

    # Pass the secret through the environment rather than argv; the child
    # falls back to OPENROUTER_API_KEY when --api-key is absent.
    child_env = dict(os.environ)
    child_env["OPENROUTER_API_KEY"] = api_key

    # Execute
    try:
        result = subprocess.run(cmd, check=False, env=child_env)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f"Error executing AI generation: {e}")
        sys.exit(1)
    sys.exit(result.returncode)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,763 @@
#!/usr/bin/env python3
"""
AI-powered slide image generation using Nano Banana Pro.
This script generates presentation slides or slide visuals using AI:
- full_slide mode: Generate complete slides with title, content, and visuals (for PDF workflow)
- visual_only mode: Generate just images/figures to place on slides (for PPT workflow)
Supports attaching reference images for context (e.g., "create a slide about this chart").
Uses smart iterative refinement:
1. Generate initial image with Nano Banana Pro
2. Quality review using Gemini 3 Pro
3. Only regenerate if quality is below threshold
4. Repeat until quality meets standards (max iterations)
Requirements:
- OPENROUTER_API_KEY environment variable
- requests library
Usage:
# Full slide for PDF workflow
python generate_slide_image_ai.py "Title: Introduction to ML\\nKey points: supervised learning, neural networks" -o slide_01.png
# Visual only for PPT workflow
python generate_slide_image_ai.py "Neural network architecture diagram" -o figure.png --visual-only
# With reference images attached
python generate_slide_image_ai.py "Create a slide explaining this chart" -o slide.png --attach chart.png --attach logo.png
"""
import argparse
import base64
import json
import os
import sys
import time
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple
try:
import requests
except ImportError:
print("Error: requests library not found. Install with: pip install requests")
sys.exit(1)
def _load_env_file():
"""Load .env file from current directory, parent directories, or package directory."""
try:
from dotenv import load_dotenv
except ImportError:
return False
# Try current working directory first
env_path = Path.cwd() / ".env"
if env_path.exists():
load_dotenv(dotenv_path=env_path, override=False)
return True
# Try parent directories (up to 5 levels)
cwd = Path.cwd()
for _ in range(5):
env_path = cwd / ".env"
if env_path.exists():
load_dotenv(dotenv_path=env_path, override=False)
return True
cwd = cwd.parent
if cwd == cwd.parent:
break
# Try the package's parent directory
script_dir = Path(__file__).resolve().parent
for _ in range(5):
env_path = script_dir / ".env"
if env_path.exists():
load_dotenv(dotenv_path=env_path, override=False)
return True
script_dir = script_dir.parent
if script_dir == script_dir.parent:
break
return False
class SlideImageGenerator:
    """Generate presentation slides or visuals using AI with iterative refinement.

    Two modes:
    - full_slide: Generate complete slide with title, content, visuals (for PDF workflow)
    - visual_only: Generate just the image/figure for a slide (for PPT workflow)

    Images are requested from ``image_model`` and scored by ``review_model``
    through the OpenRouter chat-completions endpoint (``base_url``).
    """

    # Quality threshold for presentations (lower than journal/conference papers)
    QUALITY_THRESHOLD = 6.5

    # Guidelines for generating full slides (complete slide images)
    FULL_SLIDE_GUIDELINES = """
Create a professional presentation slide image with these requirements:
SLIDE LAYOUT (16:9 aspect ratio):
- Clean, modern slide design
- Clear visual hierarchy: title at top, content below
- Generous margins (at least 5% on all sides)
- Balanced composition with intentional white space
TYPOGRAPHY:
- LARGE, bold title text (easily readable from distance)
- Clear, sans-serif fonts throughout
- High contrast text (dark on light or light on dark)
- Bullet points or key phrases, NOT paragraphs
- Maximum 5-6 lines of text content
- Default author/presenter: "K-Dense" (use this unless another name is specified)
VISUAL ELEMENTS:
- Use GENERIC, simple images and icons - avoid overly specific or detailed imagery
- MINIMAL extra elements - no decorative borders, shadows, or flourishes
- Visuals should support and enhance the message, not distract
- Professional, clean aesthetic with restraint
- Consistent color scheme (2-3 main colors only)
- Prefer abstract/conceptual visuals over literal representations
PROFESSIONAL MINIMALISM:
- Less is more: favor empty space over additional elements
- No unnecessary decorations, gradients, or visual noise
- Clean lines and simple shapes
- Focused content without visual clutter
- Corporate/academic level of professionalism
PRESENTATION QUALITY:
- Designed for projection (high contrast)
- Bold, impactful design that commands attention
- Professional and polished appearance
- No cluttered or busy layouts
- Consistent styling throughout the deck
"""

    # Guidelines for generating slide visuals only (figures/images for PPT)
    VISUAL_ONLY_GUIDELINES = """
Create a high-quality visual/figure for a presentation slide:
IMAGE QUALITY:
- Clean, professional appearance
- High resolution and sharp details
- Suitable for embedding in a slide
DESIGN:
- Simple, clear composition with MINIMAL elements
- High contrast for projection readability
- No text unless essential to the visual
- Transparent or white background preferred
- GENERIC imagery - avoid overly specific or detailed visuals
PROFESSIONAL MINIMALISM:
- Favor simplicity over complexity
- No decorative elements, shadows, or flourishes
- Clean lines and simple shapes only
- Remove any unnecessary visual noise
- Abstract/conceptual rather than literal representations
STYLE:
- Modern, professional aesthetic
- Colorblind-friendly colors
- Bold but restrained imagery
- Suitable for scientific/professional presentations
- Corporate/academic level of polish
"""

    def __init__(self, api_key: Optional[str] = None, verbose: bool = False):
        """
        Initialize the generator.

        Args:
            api_key: OpenRouter API key (or use OPENROUTER_API_KEY env var)
            verbose: Print detailed progress information

        Raises:
            ValueError: If no key is passed, set in the environment, or made
                available by loading a ``.env`` file.
        """
        self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
        if not self.api_key:
            # Last resort: populate the environment from a nearby .env file.
            _load_env_file()
            self.api_key = os.getenv("OPENROUTER_API_KEY")
        if not self.api_key:
            raise ValueError(
                "OPENROUTER_API_KEY not found. Please either:\n"
                " 1. Set the OPENROUTER_API_KEY environment variable\n"
                " 2. Add OPENROUTER_API_KEY to your .env file\n"
                " 3. Pass api_key parameter to the constructor\n"
                "Get your API key from: https://openrouter.ai/keys"
            )
        self.verbose = verbose
        self._last_error = None
        self.base_url = "https://openrouter.ai/api/v1"
        # Nano Banana Pro for image generation
        self.image_model = "google/gemini-3-pro-image-preview"
        # Gemini 3 Pro for quality review
        self.review_model = "google/gemini-3-pro"

    def _log(self, message: str):
        """Log message if verbose mode is enabled."""
        if self.verbose:
            print(f"[{time.strftime('%H:%M:%S')}] {message}")

    def _make_request(self, model: str, messages: List[Dict[str, Any]],
                      modalities: Optional[List[str]] = None) -> Dict[str, Any]:
        """Make a request to OpenRouter API.

        Args:
            model: Model identifier to send in the payload.
            messages: Chat messages to send.
            modalities: Optional output modalities; omitted from the payload
                when empty.

        Returns:
            The decoded JSON body of an HTTP 200 response. A body that is not
            a JSON object is returned as ``{"raw_text": <first 500 chars>}``.

        Raises:
            RuntimeError: On timeout, transport failure, or a non-200 status.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://github.com/scientific-writer",
            "X-Title": "Scientific Slide Generator"
        }
        payload = {
            "model": model,
            "messages": messages
        }
        if modalities:
            payload["modalities"] = modalities
        self._log(f"Making request to {model}...")
        try:
            response = requests.post(
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=payload,
                timeout=120
            )
        except requests.exceptions.Timeout as e:
            raise RuntimeError("API request timed out after 120 seconds") from e
        except requests.exceptions.RequestException as e:
            raise RuntimeError(f"API request failed: {str(e)}") from e

        # Every JSON decode error that Response.json() can raise is a
        # ValueError, whichever JSON backend requests is using; catching it
        # here keeps the HTTP status in the error reported below.
        try:
            response_json = response.json()
        except ValueError:
            response_json = {"raw_text": response.text[:500]}
        if not isinstance(response_json, dict):
            response_json = {"raw_text": response.text[:500]}
        if response.status_code != 200:
            error_detail = response_json.get("error", response_json)
            self._log(f"HTTP {response.status_code}: {error_detail}")
            raise RuntimeError(f"API request failed (HTTP {response.status_code}): {error_detail}")
        return response_json

    @staticmethod
    def _image_url_of(block: Any) -> Any:
        """Return the URL carried by an ``image_url`` block, or None for anything else."""
        if not isinstance(block, dict) or block.get("type") != "image_url":
            return None
        url = block.get("image_url", {})
        if isinstance(url, dict):
            url = url.get("url", "")
        return url

    @staticmethod
    def _data_url_payload(url: Any) -> Optional[str]:
        """Return the whitespace-free base64 payload of an image data URL, else None."""
        if not isinstance(url, str) or not url.startswith("data:image") or "," not in url:
            return None
        payload = url.split(",", 1)[1]
        return payload.replace('\n', '').replace('\r', '').replace(' ', '')

    def _extract_image_from_response(self, response: Dict[str, Any]) -> Optional[bytes]:
        """Extract base64-encoded image from API response.

        Looks, in order, at the message's ``images`` field, at a data URL
        embedded in string ``content``, and at ``image_url`` blocks in list
        ``content``.

        Returns:
            The decoded image bytes, or None when no image is found or the
            response cannot be parsed.
        """
        import re

        try:
            choices = response.get("choices", [])
            if not choices:
                self._log("No choices in response")
                return None
            message = choices[0].get("message", {})

            # Nano Banana Pro returns images in the 'images' field
            images = message.get("images", [])
            if images:
                self._log(f"Found {len(images)} image(s) in 'images' field")
                for entry in images:
                    payload = self._data_url_payload(self._image_url_of(entry))
                    if payload:
                        self._log(f"Extracted base64 data (length: {len(payload)})")
                        return base64.b64decode(payload)

            # Fallback: check content field
            content = message.get("content", "")
            if isinstance(content, str) and "data:image" in content:
                match = re.search(r'data:image/[^;]+;base64,([A-Za-z0-9+/=\n\r]+)', content)
                if match:
                    base64_str = match.group(1).replace('\n', '').replace('\r', '').replace(' ', '')
                    self._log(f"Found image in content field (length: {len(base64_str)})")
                    return base64.b64decode(base64_str)
            if isinstance(content, list):
                for i, block in enumerate(content):
                    payload = self._data_url_payload(self._image_url_of(block))
                    if payload:
                        self._log(f"Found image in content block {i}")
                        return base64.b64decode(payload)

            self._log("No image data found in response")
            return None
        except Exception as e:
            # Best effort: a malformed response is reported as "no image".
            self._log(f"Error extracting image: {str(e)}")
            return None

    def _image_to_base64(self, image_path: str) -> str:
        """Convert image file to base64 data URL.

        The MIME type is chosen from the file extension and defaults to
        ``image/png`` for unknown extensions.

        Raises:
            OSError: If the file cannot be read.
        """
        with open(image_path, "rb") as f:
            image_data = f.read()
        ext = Path(image_path).suffix.lower()
        mime_type = {
            ".png": "image/png",
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".gif": "image/gif",
            ".webp": "image/webp"
        }.get(ext, "image/png")
        base64_data = base64.b64encode(image_data).decode("utf-8")
        return f"data:{mime_type};base64,{base64_data}"

    def generate_image(self, prompt: str, attachments: Optional[List[str]] = None) -> Optional[bytes]:
        """
        Generate an image using Nano Banana Pro.

        Args:
            prompt: Text description of the image to generate
            attachments: Optional list of image file paths to attach as context

        Returns:
            Image bytes or None if generation failed. On failure the reason
            is stored in ``self._last_error``.
        """
        self._last_error = None

        # Build content with the text prompt and optional image attachments
        content: List[Dict[str, Any]] = [{
            "type": "text",
            "text": prompt
        }]
        for img_path in attachments or []:
            try:
                img_data_url = self._image_to_base64(img_path)
            except (OSError, TypeError, ValueError) as e:
                # Generation continues without this attachment, but the user
                # is told even when verbose logging is off.
                print(f"Warning: Could not attach {img_path}: {e}", file=sys.stderr)
                continue
            content.append({
                "type": "image_url",
                "image_url": {"url": img_data_url}
            })
            self._log(f"Attached image: {img_path}")

        messages = [
            {
                "role": "user",
                "content": content if attachments else prompt
            }
        ]
        try:
            response = self._make_request(
                model=self.image_model,
                messages=messages,
                modalities=["image", "text"]
            )
            self._log(f"Response keys: {response.keys()}")
            if "error" in response:
                error_msg = response["error"]
                if isinstance(error_msg, dict):
                    error_msg = error_msg.get("message", str(error_msg))
                self._last_error = f"API Error: {error_msg}"
                print(f"{self._last_error}")
                return None
            image_data = self._extract_image_from_response(response)
            if image_data:
                self._log(f"✓ Generated image ({len(image_data)} bytes)")
            else:
                self._last_error = "No image data in API response"
                self._log(f"{self._last_error}")
            return image_data
        except RuntimeError as e:
            self._last_error = str(e)
            self._log(f"✗ Generation failed: {self._last_error}")
            return None
        except Exception as e:
            self._last_error = f"Unexpected error: {str(e)}"
            self._log(f"✗ Generation failed: {self._last_error}")
            return None

    @staticmethod
    def _parse_review(content: Any, threshold: float) -> Tuple[float, bool]:
        """Pull ``(score, needs_improvement)`` out of a review reply.

        The score defaults to 7.0 when none can be found. Markdown emphasis
        around the label or the number (``**SCORE:** 8``) is tolerated.
        Improvement is needed when the reply contains ``NEEDS_IMPROVEMENT``
        or the score is below *threshold*.
        """
        import re

        text = content if isinstance(content, str) else str(content or "")
        number = r'(\d+(?:\.\d+)?)'
        match = (
            re.search(r'SCORE\s*:\s*\**\s*' + number, text, re.IGNORECASE)
            or re.search(r'(?:score|rating|quality)[:\s]+' + number, text, re.IGNORECASE)
        )
        score = float(match.group(1)) if match else 7.0
        needs_improvement = "NEEDS_IMPROVEMENT" in text.upper() or score < threshold
        return score, needs_improvement

    def review_image(self, image_path: str, original_prompt: str,
                     iteration: int, visual_only: bool = False,
                     max_iterations: int = 2) -> Tuple[str, float, bool]:
        """Review generated image using Gemini 3 Pro.

        Args:
            image_path: Path of the image file to review.
            original_prompt: The user's request, quoted in the review prompt.
            iteration: Current iteration number (shown to the reviewer).
            visual_only: Whether the image is a standalone visual or a slide.
            max_iterations: Total iterations (shown to the reviewer).

        Returns:
            ``(critique, score, needs_improvement)``. If the review request
            fails or returns no choices, a neutral result with score 7.0 and
            ``needs_improvement=False`` is returned so generation can proceed.

        Raises:
            OSError: If *image_path* cannot be read.
        """
        image_data_url = self._image_to_base64(image_path)
        threshold = self.QUALITY_THRESHOLD
        image_type = "slide visual/figure" if visual_only else "presentation slide"
        review_prompt = f"""You are an expert reviewer evaluating a {image_type} for presentation quality.
ORIGINAL REQUEST: {original_prompt}
QUALITY THRESHOLD: {threshold}/10
ITERATION: {iteration}/{max_iterations}
Evaluate this {image_type} on these criteria:
1. **Visual Impact** (0-2 points)
- Bold, attention-grabbing design
- Professional appearance
- Suitable for projection
2. **Clarity** (0-2 points)
- Easy to understand at a glance
- Clear visual hierarchy
- Not cluttered or busy
3. **Readability** (0-2 points)
- Text is large and readable (if present)
- High contrast
- Clean typography
4. **Composition** (0-2 points)
- Balanced layout
- Good use of space
- Appropriate margins
5. **Relevance** (0-2 points)
- Matches the requested content
- Appropriate style for presentations
- Professional quality
RESPOND IN THIS EXACT FORMAT:
SCORE: [total score 0-10]
STRENGTHS:
- [strength 1]
- [strength 2]
ISSUES:
- [issue 1 if any]
- [issue 2 if any]
VERDICT: [ACCEPTABLE or NEEDS_IMPROVEMENT]
If score >= {threshold}, the image is ACCEPTABLE.
If score < {threshold}, mark as NEEDS_IMPROVEMENT with specific suggestions."""
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": review_prompt},
                    {"type": "image_url", "image_url": {"url": image_data_url}}
                ]
            }
        ]
        try:
            response = self._make_request(model=self.review_model, messages=messages)
            choices = response.get("choices", [])
            if not choices:
                return "Image generated successfully", 7.0, False
            message = choices[0].get("message", {})
            # `or ""` also covers an explicit null in the reply.
            content = message.get("content") or ""
            reasoning = message.get("reasoning") or ""
            if reasoning and not content:
                content = reasoning
            if isinstance(content, list):
                content = "\n".join(
                    block.get("text", "")
                    for block in content
                    if isinstance(block, dict) and block.get("type") == "text"
                )
            score, needs_improvement = self._parse_review(content, threshold)
            self._log(f"✓ Review complete (Score: {score}/10, Threshold: {threshold}/10)")
            return (content if content else "Image generated successfully", score, needs_improvement)
        except Exception as e:
            # Deliberate best effort: a failed review must not block output.
            self._log(f"Review skipped: {str(e)}")
            return "Image generated successfully (review skipped)", 7.0, False

    def improve_prompt(self, original_prompt: str, critique: str,
                       iteration: int, visual_only: bool = False) -> str:
        """Improve the generation prompt based on critique.

        Returns the mode's guidelines followed by the user request and the
        critique to address in the given iteration.
        """
        guidelines = self.VISUAL_ONLY_GUIDELINES if visual_only else self.FULL_SLIDE_GUIDELINES
        return f"""{guidelines}
USER REQUEST: {original_prompt}
ITERATION {iteration}: Based on previous feedback, address these specific improvements:
{critique}
Generate an improved version that addresses all the critique points."""

    def _review_image_bytes(self, image_data: bytes, suffix: str, user_prompt: str,
                            iteration: int, visual_only: bool,
                            max_iterations: int) -> Tuple[str, float, bool]:
        """Review *image_data* via a temporary file that is always removed afterwards."""
        import tempfile

        temp_fd, temp_name = tempfile.mkstemp(suffix=suffix)
        try:
            with os.fdopen(temp_fd, "wb") as f:
                f.write(image_data)
            return self.review_image(temp_name, user_prompt, iteration,
                                     visual_only, max_iterations)
        finally:
            try:
                os.unlink(temp_name)
            except OSError:
                # Cleanup is best effort; a leftover temp file is not fatal.
                pass

    def generate_slide(self, user_prompt: str, output_path: str,
                       visual_only: bool = False,
                       iterations: int = 2,
                       attachments: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Generate a slide image or visual with iterative refinement.

        Each iteration generates an image and has it reviewed. The loop stops
        as soon as a review accepts the image. Otherwise the best-scoring
        image produced so far is kept (ties go to the later image), so a
        failed or lower-scoring regeneration never discards a usable result.

        Args:
            user_prompt: Description of the slide/visual to generate
            output_path: Path to save final image
            visual_only: If True, generate just the visual (for PPT workflow)
            iterations: Maximum refinement iterations (default: 2)
            attachments: Optional list of image file paths to attach as context

        Returns:
            Dictionary with generation results and metadata
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        extension = output_path.suffix or ".png"
        mode = "visual_only" if visual_only else "full_slide"
        guidelines = self.VISUAL_ONLY_GUIDELINES if visual_only else self.FULL_SLIDE_GUIDELINES
        results = {
            "user_prompt": user_prompt,
            "mode": mode,
            "quality_threshold": self.QUALITY_THRESHOLD,
            "attachments": attachments or [],
            "iterations": [],
            "final_image": None,
            "final_score": 0.0,
            "success": False,
            "early_stop": False
        }
        current_prompt = f"""{guidelines}
USER REQUEST: {user_prompt}
Generate a high-quality {'visual/figure' if visual_only else 'presentation slide'} that meets all the guidelines above."""
        print(f"\n{'='*60}")
        print(f"Generating Slide {'Visual' if visual_only else 'Image'}")
        print(f"{'='*60}")
        print(f"Description: {user_prompt[:100]}{'...' if len(user_prompt) > 100 else ''}")
        print(f"Mode: {mode}")
        if attachments:
            print(f"Attachments: {len(attachments)} image(s)")
            for att in attachments:
                print(f" - {att}")
        print(f"Quality Threshold: {self.QUALITY_THRESHOLD}/10")
        print(f"Max Iterations: {iterations}")
        print(f"Output: {output_path}")
        print(f"{'='*60}\n")

        # Best reviewed image so far; written to output_path after the loop.
        best_image = None
        best_score = None
        for i in range(1, iterations + 1):
            print(f"\n[Iteration {i}/{iterations}]")
            print("-" * 40)
            print("Generating image with Nano Banana Pro...")
            image_data = self.generate_image(current_prompt, attachments=attachments)
            if not image_data:
                error_msg = self._last_error or 'Image generation failed'
                print(f"✗ Generation failed: {error_msg}")
                results["iterations"].append({
                    "iteration": i,
                    "success": False,
                    "error": error_msg
                })
                continue

            print(f"✓ Generated image (iteration {i})")
            print("Reviewing image with Gemini 3 Pro...")
            critique, score, needs_improvement = self._review_image_bytes(
                image_data, extension, user_prompt, i, visual_only, iterations
            )
            print(f"✓ Score: {score}/10 (threshold: {self.QUALITY_THRESHOLD}/10)")
            results["iterations"].append({
                "iteration": i,
                "critique": critique,
                "score": score,
                "needs_improvement": needs_improvement,
                "success": True
            })

            if not needs_improvement:
                print(f"\n✓ Quality meets threshold ({score} >= {self.QUALITY_THRESHOLD})")
                # An accepted image always wins over earlier rejected ones.
                best_image, best_score = image_data, score
                results["early_stop"] = True
                break

            if best_score is None or score >= best_score:
                best_image, best_score = image_data, score
            if i == iterations:
                print("\n⚠ Maximum iterations reached")
                break
            print(f"\n⚠ Quality below threshold ({score} < {self.QUALITY_THRESHOLD})")
            print("Improving prompt...")
            current_prompt = self.improve_prompt(user_prompt, critique, i + 1, visual_only)

        # Save only the final image to output path
        if best_image is not None:
            with open(output_path, "wb") as f:
                f.write(best_image)
            results["final_image"] = str(output_path)
            results["final_score"] = best_score
            results["success"] = True
            print(f"\n✓ Final image: {output_path}")

        print(f"\n{'='*60}")
        print("Generation Complete!")
        print(f"Final Score: {results['final_score']}/10")
        if results["early_stop"]:
            success_count = len([r for r in results['iterations'] if r.get('success')])
            print(f"Iterations Used: {success_count}/{iterations} (early stop)")
        print(f"{'='*60}\n")
        return results
def main():
    """Command-line interface.

    Validates the iteration count and attachment paths, then runs
    ``SlideImageGenerator.generate_slide``. The API key is resolved by the
    generator itself (``--api-key``, then ``OPENROUTER_API_KEY``, then a
    ``.env`` file), so a key that lives only in ``.env`` works from the CLI.
    Exits with status 0 on success and 1 on any failure.
    """
    parser = argparse.ArgumentParser(
        description="Generate presentation slides or visuals using Nano Banana Pro AI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Generate a full slide (for PDF workflow)
python generate_slide_image_ai.py "Title: Machine Learning Basics\\nKey points: supervised learning, neural networks, deep learning" -o slide_01.png
# Generate just a visual/figure (for PPT workflow)
python generate_slide_image_ai.py "Neural network architecture diagram with input, hidden, and output layers" -o figure.png --visual-only
# With reference images attached (Nano Banana Pro will see these)
python generate_slide_image_ai.py "Create a slide explaining this chart with key insights" -o slide.png --attach chart.png
python generate_slide_image_ai.py "Combine these images into a comparison slide" -o compare.png --attach before.png --attach after.png
# With custom iterations
python generate_slide_image_ai.py "Title slide for AI Conference 2025" -o title.png --iterations 2
# Verbose output
python generate_slide_image_ai.py "Data flow diagram" -o flow.png -v
Environment:
OPENROUTER_API_KEY OpenRouter API key (required)
"""
    )
    parser.add_argument("prompt", help="Description of the slide or visual to generate")
    parser.add_argument("-o", "--output", required=True, help="Output image path")
    parser.add_argument("--attach", action="append", dest="attachments", metavar="IMAGE",
                        help="Attach image file(s) as context for generation (can use multiple times)")
    parser.add_argument("--visual-only", action="store_true",
                        help="Generate just the visual/figure (for PPT workflow)")
    parser.add_argument("--iterations", type=int, default=2,
                        help="Maximum refinement iterations (default: 2)")
    parser.add_argument("--api-key", help="OpenRouter API key (or set OPENROUTER_API_KEY)")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    args = parser.parse_args()

    if args.iterations < 1 or args.iterations > 2:
        print("Error: Iterations must be between 1 and 2")
        sys.exit(1)

    # Validate attachments: each must be an existing regular file.
    for att in args.attachments or []:
        if not Path(att).is_file():
            print(f"Error: Attachment file not found: {att}")
            sys.exit(1)

    try:
        # No early exit on a missing key: the constructor also consults a
        # .env file and raises ValueError with setup instructions if that
        # fails too; the handler below reports it.
        generator = SlideImageGenerator(api_key=args.api_key, verbose=args.verbose)
        results = generator.generate_slide(
            user_prompt=args.prompt,
            output_path=args.output,
            visual_only=args.visual_only,
            iterations=args.iterations,
            attachments=args.attachments
        )
    except Exception as e:
        print(f"\n✗ Error: {str(e)}")
        sys.exit(1)

    if results["success"]:
        print(f"\n✓ Success! Image saved to: {args.output}")
        sys.exit(0)
    # No review log is written anywhere; verbose mode is where details appear.
    print("\n✗ Generation failed. Re-run with -v for details.")
    sys.exit(1)


if __name__ == "__main__":
    main()

221
scripts/pdf_to_images.py Normal file
View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
PDF to Images Converter for Presentations
Converts presentation PDFs to images for visual inspection and review.
Supports multiple output formats and resolutions.
Uses PyMuPDF (fitz) as the primary conversion method - no external
dependencies required (no poppler, ghostscript, or ImageMagick needed).
"""
import sys
import argparse
from pathlib import Path
from typing import Optional, List
# Try to import pymupdf (preferred - no external dependencies)
try:
import fitz # PyMuPDF
HAS_PYMUPDF = True
except ImportError:
HAS_PYMUPDF = False
class PDFToImagesConverter:
    """Converts PDF presentations to images.

    One image is written per page, named
    ``<output_prefix>-<page number, 3 digits>.<format>``.
    """

    # Accepted values for the `format` argument (compared case-insensitively).
    SUPPORTED_FORMATS = ('jpg', 'jpeg', 'png')

    def __init__(
        self,
        pdf_path: str,
        output_prefix: str,
        dpi: int = 150,
        format: str = 'jpg',
        first_page: Optional[int] = None,
        last_page: Optional[int] = None
    ):
        """Store and validate the conversion settings.

        Args:
            pdf_path: Path of the PDF to convert.
            output_prefix: Filename prefix, optionally including a directory.
            dpi: Render resolution; must be positive.
            format: Output format: ``jpg``, ``jpeg`` or ``png``.
            first_page: First page to convert (1-indexed); None means page 1.
            last_page: Last page to convert (1-indexed, inclusive); None means
                the final page.

        Raises:
            ValueError: If the format is unsupported or the DPI is not positive.
        """
        self.pdf_path = Path(pdf_path)
        self.output_prefix = output_prefix
        self.dpi = dpi
        self.format = format.lower()
        self.first_page = first_page
        self.last_page = last_page
        # Validate format
        if self.format not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported format: {format}. Use jpg or png.")
        if self.dpi <= 0:
            raise ValueError(f"DPI must be a positive number, got {dpi}.")

    def convert(self) -> List[Path]:
        """Convert PDF to images using PyMuPDF.

        Returns:
            Paths of the images written, in page order.

        Raises:
            FileNotFoundError: If the PDF does not exist.
            RuntimeError: If PyMuPDF is not installed.
            ValueError: If the requested page range is invalid.
        """
        if not self.pdf_path.exists():
            raise FileNotFoundError(f"PDF not found: {self.pdf_path}")
        print(f"Converting: {self.pdf_path.name}")
        print(f"Output prefix: {self.output_prefix}")
        print(f"DPI: {self.dpi}")
        print(f"Format: {self.format}")
        if not HAS_PYMUPDF:
            raise RuntimeError(
                "PyMuPDF not installed. Install it with:\n"
                " pip install pymupdf\n\n"
                "PyMuPDF is a self-contained library - no external dependencies needed."
            )
        return self._convert_with_pymupdf()

    def _resolve_page_range(self, page_count: int) -> range:
        """Return the 0-based page indices to render for a *page_count*-page document.

        A ``last_page`` beyond the end of the document is clamped to the
        final page.

        Raises:
            ValueError: If ``first_page`` is below 1 or past the last page,
                or if ``last_page`` is before ``first_page``.
        """
        first = 1 if self.first_page is None else self.first_page
        last = page_count if self.last_page is None else self.last_page
        if first < 1:
            raise ValueError(f"First page must be 1 or greater, got {first}.")
        if self.last_page is not None and last < first:
            raise ValueError(f"Last page ({last}) is before first page ({first}).")
        if self.first_page is not None and first > page_count:
            raise ValueError(
                f"First page ({first}) is beyond the end of the document "
                f"({page_count} page(s))."
            )
        if last > page_count:
            print(f"Note: last page {last} exceeds page count; stopping at page {page_count}")
            last = page_count
        return range(first - 1, last)

    def _convert_with_pymupdf(self) -> List[Path]:
        """Convert using PyMuPDF library (no external dependencies)."""
        print("Using PyMuPDF (no external dependencies required)...")
        doc = fitz.open(self.pdf_path)
        try:
            pages = self._resolve_page_range(doc.page_count)
            # Calculate zoom factor from DPI (72 DPI is the base)
            zoom = self.dpi / 72
            matrix = fitz.Matrix(zoom, zoom)
            output_dir = Path(self.output_prefix).parent
            output_dir.mkdir(parents=True, exist_ok=True)
            image_kind = "jpeg" if self.format in ('jpg', 'jpeg') else "png"

            output_files = []
            for page_num in pages:
                # Render page to pixmap
                pixmap = doc[page_num].get_pixmap(matrix=matrix)
                # File names carry the 1-based page number.
                output_path = Path(f"{self.output_prefix}-{page_num + 1:03d}.{self.format}")
                pixmap.save(str(output_path), output=image_kind)
                output_files.append(output_path)
                print(f" Created: {output_path.name}")
            return output_files
        finally:
            # Close the document even when rendering or saving fails.
            doc.close()
def main():
    """Command-line entry point: convert a presentation PDF to per-page images.

    Exits with status 0 after a successful conversion and 1 when the
    conversion raises; the error is written to stderr. The output directory
    is created by the converter during conversion, so a failed run (for
    example a missing PDF) no longer leaves an empty directory behind.
    """
    parser = argparse.ArgumentParser(
        description='Convert presentation PDFs to images',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s presentation.pdf slides
→ Creates slides-001.jpg, slides-002.jpg, ...
%(prog)s presentation.pdf output/slide --dpi 300 --format png
→ Creates output/slide-001.png, slide-002.png, ... at high resolution
%(prog)s presentation.pdf review/s --first 5 --last 10
→ Converts only slides 5-10
Output:
Images are named: PREFIX-001.FORMAT, PREFIX-002.FORMAT, etc.
Resolution:
- 150 DPI: Good for screen review (default)
- 200 DPI: Higher quality for detailed inspection
- 300 DPI: Print quality (larger files)
Requirements:
Install PyMuPDF (no external dependencies needed):
pip install pymupdf
"""
    )
    parser.add_argument(
        'pdf_path',
        help='Path to PDF presentation'
    )
    parser.add_argument(
        'output_prefix',
        help='Output filename prefix (e.g., "slides" or "output/slide")'
    )
    parser.add_argument(
        '--dpi', '-r',
        type=int,
        default=150,
        help='Resolution in DPI (default: 150)'
    )
    parser.add_argument(
        '--format', '-f',
        choices=['jpg', 'jpeg', 'png'],
        default='jpg',
        help='Output format (default: jpg)'
    )
    parser.add_argument(
        '--first',
        type=int,
        help='First page to convert (1-indexed)'
    )
    parser.add_argument(
        '--last',
        type=int,
        help='Last page to convert (1-indexed)'
    )
    args = parser.parse_args()

    # Convert
    try:
        converter = PDFToImagesConverter(
            pdf_path=args.pdf_path,
            output_prefix=args.output_prefix,
            dpi=args.dpi,
            format=args.format,
            first_page=args.first,
            last_page=args.last
        )
        output_files = converter.convert()
        print()
        print("=" * 60)
        print(f"✅ Success! Created {len(output_files)} image(s)")
        print("=" * 60)
        if output_files:
            print(f"\nFirst image: {output_files[0]}")
            print(f"Last image: {output_files[-1]}")
            # Calculate total size
            total_size = sum(f.stat().st_size for f in output_files)
            size_mb = total_size / (1024 * 1024)
            print(f"Total size: {size_mb:.2f} MB")
        print("\nNext steps:")
        print(" 1. Review images for layout issues")
        print(" 2. Check for text overflow or element overlap")
        print(" 3. Verify readability from distance")
        print(" 4. Document issues with slide numbers")
    except Exception as e:
        print(f"\n❌ Error: {str(e)}", file=sys.stderr)
        sys.exit(1)
    sys.exit(0)


if __name__ == '__main__':
    main()

235
scripts/slides_to_pdf.py Normal file
View File

@@ -0,0 +1,235 @@
#!/usr/bin/env python3
"""
Combine slide images into a single PDF presentation.
This script takes multiple slide images (PNG, JPG) and combines them
into a single PDF file, maintaining aspect ratio and quality.
Usage:
# Combine all PNG files in a directory
python slides_to_pdf.py slides/*.png -o presentation.pdf
# Combine specific files in order
python slides_to_pdf.py slide_01.png slide_02.png slide_03.png -o presentation.pdf
# From a directory (sorted by filename)
python slides_to_pdf.py slides/ -o presentation.pdf
"""
import argparse
import sys
from pathlib import Path
from typing import List
try:
from PIL import Image
except ImportError:
print("Error: Pillow library not found. Install with: pip install Pillow")
sys.exit(1)
def get_image_files(paths: List[str]) -> List[Path]:
    """
    Resolve command-line paths into an ordered list of image files.

    Each entry of ``paths`` may be an image file, a directory, or a glob
    pattern that the shell did not expand.

    Ordering rules:
        - Entries are processed in the order given, so explicitly listed
          files keep the order the caller chose (slide order).
        - Images found in a directory, or matched by a glob pattern, are
          sorted by filename within that entry.
        - A file reachable through several entries is kept only once, at
          its first position.

    Args:
        paths: List of file paths, directory paths, or glob patterns

    Returns:
        Ordered, de-duplicated list of image file paths
    """
    image_extensions = {'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp'}

    def is_image(candidate: Path) -> bool:
        # Compare the lower-cased suffix so ".PNG", ".Png" and ".png" all match.
        return candidate.is_file() and candidate.suffix.lower() in image_extensions

    def sorted_by_name(candidates) -> List[Path]:
        return sorted(candidates, key=lambda p: p.name)

    collected: List[Path] = []

    for path_str in paths:
        path = Path(path_str)

        if path.is_file():
            if path.suffix.lower() in image_extensions:
                collected.append(path)
            else:
                print(f"Warning: Skipping non-image file: {path}")
        elif path.is_dir():
            # Directory contents have no caller-defined order: sort by name.
            collected.extend(sorted_by_name(p for p in path.iterdir() if is_image(p)))
        else:
            # Not an existing file or directory: treat the last component as
            # a glob pattern relative to its parent directory.
            parent = path.parent
            matches: List[Path] = []
            if parent.exists():
                matches = sorted_by_name(m for m in parent.glob(path.name) if is_image(m))
            if not matches:
                # Surface typos instead of silently dropping a slide.
                print(f"Warning: No image files match: {path}")
            collected.extend(matches)

    # Drop duplicates while keeping the first occurrence of each path.
    seen = set()
    image_files: List[Path] = []
    for candidate in collected:
        if candidate not in seen:
            seen.add(candidate)
            image_files.append(candidate)

    return image_files
def _load_flattened_rgb(img_path: Path):
    """Open ``img_path`` and return an in-memory RGB copy with transparency composited onto white."""
    # The context manager releases the source file handle as soon as the pixel
    # data has been copied, so no handles stay open while other slides load.
    with Image.open(img_path) as source:
        if source.mode in ('RGBA', 'LA', 'PA', 'P'):
            # Go through RGBA so palette transparency and grey+alpha images
            # are composited the same way as true RGBA images.
            rgba = source.convert('RGBA')
            canvas = Image.new('RGB', rgba.size, (255, 255, 255))
            canvas.paste(rgba, mask=rgba.split()[-1])
            return canvas
        # convert() returns a loaded copy even when the mode is already RGB.
        return source.convert('RGB')


def combine_images_to_pdf(image_paths: List[Path], output_path: Path,
                          dpi: int = 150, verbose: bool = False) -> bool:
    """
    Combine multiple images into a single PDF, one image per page.

    Every image is converted to RGB first; transparent areas are flattened
    onto a white background. All loaded images are closed before returning,
    including when loading or saving fails part-way through.

    Args:
        image_paths: List of image file paths, in page order
        output_path: Output PDF path (parent directories are created)
        dpi: Resolution for the PDF (default: 150)
        verbose: Print progress information

    Returns:
        True if successful, False otherwise
    """
    if not image_paths:
        print("Error: No image files found")
        return False

    if verbose:
        print(f"Combining {len(image_paths)} images into PDF...")

    images = []
    try:
        # Load all images
        for i, img_path in enumerate(image_paths):
            try:
                img = _load_flattened_rgb(img_path)
            except Exception as e:
                print(f"Error loading {img_path}: {e}")
                return False
            images.append(img)
            if verbose:
                print(f" [{i+1}/{len(image_paths)}] Loaded: {img_path.name} ({img.size[0]}x{img.size[1]})")

        # Create output directory if needed
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Save as PDF: the first image drives the save, the rest are appended.
        try:
            images[0].save(
                output_path,
                "PDF",
                resolution=dpi,
                save_all=True,
                append_images=images[1:]
            )

            if verbose:
                print(f"\n✓ PDF created: {output_path}")
                print(f" Total slides: {len(images)}")
                file_size = output_path.stat().st_size
                if file_size > 1024 * 1024:
                    print(f" File size: {file_size / (1024 * 1024):.1f} MB")
                else:
                    print(f" File size: {file_size / 1024:.1f} KB")

            return True
        except Exception as e:
            print(f"Error creating PDF: {e}")
            return False
    finally:
        # Close every image that was loaded, on success and on every failure path.
        for img in images:
            img.close()
def main():
    """Command-line interface: collect the requested images and write them to one PDF.

    Exits with status 0 when the PDF was written, and with status 1 when no
    image files were found or the PDF could not be created.
    """
    parser = argparse.ArgumentParser(
        description="Combine slide images into a single PDF presentation",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Combine PNG files using glob pattern
python slides_to_pdf.py slides/*.png -o presentation.pdf
# Combine specific files in order
python slides_to_pdf.py title.png intro.png methods.png results.png -o talk.pdf
# Combine all images from a directory (sorted by filename)
python slides_to_pdf.py slides/ -o presentation.pdf
# With custom DPI and verbose output
python slides_to_pdf.py slides/*.png -o presentation.pdf --dpi 200 -v
Supported formats: PNG, JPG, JPEG, GIF, WEBP, BMP
Tips:
- Name your slide images with numbers for correct ordering:
01_title.png, 02_intro.png, 03_methods.png, etc.
- Use the generate_slide_image.py script to create slides first
- Standard presentation aspect ratio is 16:9 (1920x1080 or 1280x720)
"""
    )
    parser.add_argument("images", nargs="+",
                        help="Image files, directories, or glob patterns")
    parser.add_argument("-o", "--output", required=True,
                        help="Output PDF file path")
    parser.add_argument("--dpi", type=int, default=150,
                        help="PDF resolution in DPI (default: 150)")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose output")
    args = parser.parse_args()

    # Get image files
    image_files = get_image_files(args.images)
    if not image_files:
        print("Error: No image files found matching the specified paths")
        print("\nUsage examples:")
        print(" python slides_to_pdf.py slides/*.png -o presentation.pdf")
        print(" python slides_to_pdf.py slide1.png slide2.png -o presentation.pdf")
        sys.exit(1)

    print(f"Found {len(image_files)} image(s)")
    if args.verbose:
        for f in image_files:
            print(f" - {f}")

    # Combine into PDF
    output_path = Path(args.output)
    success = combine_images_to_pdf(
        image_files,
        output_path,
        dpi=args.dpi,
        verbose=args.verbose
    )

    if not success:
        print("\n✗ Failed to create PDF")
        sys.exit(1)

    # In verbose mode combine_images_to_pdf has already printed its own
    # "PDF created" confirmation; do not repeat it.
    if not args.verbose:
        print(f"\n✓ PDF created: {output_path}")
    sys.exit(0)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,403 @@
#!/usr/bin/env python3
"""
Presentation Validation Script
Validates scientific presentations for common issues:
- Slide count vs. duration
- LaTeX compilation
- File size checks
- Basic format validation
"""
import sys
import os
import argparse
import subprocess
from pathlib import Path
from typing import Dict, List, Tuple, Optional
# Try to import PyPDF2 for PDF analysis
try:
import PyPDF2
HAS_PYPDF2 = True
except ImportError:
HAS_PYPDF2 = False
# Try to import python-pptx for PowerPoint analysis
try:
from pptx import Presentation
HAS_PPTX = True
except ImportError:
HAS_PPTX = False
class PresentationValidator:
    """Validates presentations (PDF, PowerPoint, LaTeX Beamer) for common issues.

    Findings are accumulated in three lists on the instance:
    ``issues`` (any entry makes the result invalid), ``warnings`` (advisory)
    and ``info`` (descriptive facts about the file).
    """

    # Recommended slide counts by duration in minutes: (min, recommended, max)
    SLIDE_GUIDELINES = {
        5: (5, 6, 8),
        10: (8, 11, 14),
        15: (13, 16, 20),
        20: (18, 22, 26),
        30: (22, 27, 33),
        45: (32, 40, 50),
        60: (40, 52, 65),
    }

    # Seconds a single pdflatex run may take before it is abandoned.
    LATEX_TIMEOUT_SECONDS = 60

    def __init__(self, filepath: str, duration: Optional[int] = None):
        """Store the target file and optional talk duration (minutes)."""
        self.filepath = Path(filepath)
        self.duration = duration
        self.file_type = self.filepath.suffix.lower()
        self.issues = []
        self.warnings = []
        self.info = []
        # Reason for the most recent failed LaTeX compilation, if any.
        self._compile_error = None

    def validate(self) -> Dict:
        """Run all validations and return results (see ``_format_results``)."""
        print(f"Validating: {self.filepath.name}")
        print(f"File type: {self.file_type}")
        print("=" * 60)

        # Check file exists
        if not self.filepath.exists():
            self.issues.append(f"File not found: {self.filepath}")
            return self._format_results()

        # File size check
        self._check_file_size()

        # Type-specific validation
        if self.file_type == '.pdf':
            self._validate_pdf()
        elif self.file_type in ['.pptx', '.ppt']:
            self._validate_pptx()
        elif self.file_type in ['.tex']:
            self._validate_latex()
        else:
            self.warnings.append(f"Unknown file type: {self.file_type}")

        return self._format_results()

    def _check_file_size(self):
        """Record the file size; warn above 50 MB and raise an issue above 100 MB."""
        size_mb = self.filepath.stat().st_size / (1024 * 1024)
        self.info.append(f"File size: {size_mb:.2f} MB")

        if size_mb > 100:
            self.issues.append(
                f"File is very large ({size_mb:.1f} MB). "
                "Consider compressing images."
            )
        elif size_mb > 50:
            self.warnings.append(
                f"File is large ({size_mb:.1f} MB). "
                "May be slow to email or upload."
            )

    def _validate_pdf(self):
        """Validate PDF presentation: page count, page size and aspect ratio."""
        if not HAS_PYPDF2:
            self.warnings.append(
                "PyPDF2 not installed. Install with: pip install PyPDF2"
            )
            return

        try:
            with open(self.filepath, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                num_pages = len(reader.pages)
                self.info.append(f"Number of slides: {num_pages}")

                # An empty document has no first page to measure; report it
                # explicitly instead of surfacing an index error.
                if num_pages == 0:
                    self.issues.append("PDF contains no pages.")
                    return

                # Check slide count against duration
                if self.duration is not None:
                    self._check_slide_count(num_pages)

                # Get page size
                first_page = reader.pages[0]
                media_box = first_page.mediabox
                width = float(media_box.width)
                height = float(media_box.height)

                # Convert points to inches (72 points = 1 inch)
                width_in = width / 72
                height_in = height / 72
                aspect = width / height

                self.info.append(
                    f"Slide dimensions: {width_in:.1f}\" × {height_in:.1f}\" "
                    f"(aspect ratio: {aspect:.2f})"
                )

                # Check common aspect ratios
                if abs(aspect - 16/9) < 0.01:
                    self.info.append("Aspect ratio: 16:9 (widescreen)")
                elif abs(aspect - 4/3) < 0.01:
                    self.info.append("Aspect ratio: 4:3 (standard)")
                else:
                    self.warnings.append(
                        f"Unusual aspect ratio: {aspect:.2f}. "
                        "Confirm this matches venue requirements."
                    )
        except Exception as e:
            self.issues.append(f"Error reading PDF: {str(e)}")

    def _validate_pptx(self):
        """Validate PowerPoint presentation: slide count, dimensions and text content."""
        if not HAS_PPTX:
            self.warnings.append(
                "python-pptx not installed. Install with: pip install python-pptx"
            )
            return

        try:
            prs = Presentation(self.filepath)
            num_slides = len(prs.slides)
            self.info.append(f"Number of slides: {num_slides}")

            # Check slide count against duration
            if self.duration is not None:
                self._check_slide_count(num_slides)

            # Get slide dimensions
            width_inches = prs.slide_width / 914400  # EMU to inches
            height_inches = prs.slide_height / 914400
            aspect = prs.slide_width / prs.slide_height

            self.info.append(
                f"Slide dimensions: {width_inches:.1f}\" × {height_inches:.1f}\" "
                f"(aspect ratio: {aspect:.2f})"
            )

            # Check fonts and text
            self._check_pptx_content(prs)
        except Exception as e:
            self.issues.append(f"Error reading PowerPoint: {str(e)}")

    def _check_pptx_content(self, prs):
        """Warn about slides with explicit font sizes below 18pt or more than 6 bullets in a shape."""
        small_text_slides = set()
        many_bullets_slides = set()

        for idx, slide in enumerate(prs.slides, start=1):
            for shape in slide.shapes:
                if not shape.has_text_frame:
                    continue

                paragraphs = shape.text_frame.paragraphs

                # Only runs with an explicit size can be checked; runs whose
                # size is unset (falsy) are skipped.
                if any(
                    run.font.size and run.font.size.pt < 18
                    for paragraph in paragraphs
                    for run in paragraph.runs
                ):
                    small_text_slides.add(idx)

                # Count top-level paragraphs that actually contain text, so
                # blank spacer lines do not inflate the bullet count.
                bullet_count = sum(
                    1 for p in paragraphs if p.level == 0 and p.text.strip()
                )
                if bullet_count > 6:
                    many_bullets_slides.add(idx)

        # Report issues (at most the first five slide numbers of each kind)
        if small_text_slides:
            unique_slides = sorted(small_text_slides)
            self.warnings.append(
                f"Small text (<18pt) found on slides: {unique_slides[:5]}"
                + (" ..." if len(unique_slides) > 5 else "")
            )

        if many_bullets_slides:
            unique_slides = sorted(many_bullets_slides)
            self.warnings.append(
                f"Many bullets (>6) on slides: {unique_slides[:5]}"
                + (" ..." if len(unique_slides) > 5 else "")
            )

    def _validate_latex(self):
        """Validate LaTeX Beamer presentation by compiling it and checking the resulting PDF."""
        self.info.append("LaTeX source file detected")

        # Try to compile; on failure report the specific reason recorded by
        # _try_compile_latex (tool missing, timeout, or compile error).
        if not self._try_compile_latex():
            self.issues.append(
                self._compile_error
                or "LaTeX compilation failed. Check .log file for errors."
            )
            return

        self.info.append("LaTeX compilation: SUCCESS")

        # If PDF was generated, validate it
        pdf_path = self.filepath.with_suffix('.pdf')
        if pdf_path.exists():
            pdf_validator = PresentationValidator(str(pdf_path), self.duration)
            pdf_results = pdf_validator.validate()

            # Merge results
            self.info.extend(pdf_results['info'])
            self.warnings.extend(pdf_results['warnings'])
            self.issues.extend(pdf_results['issues'])

    def _try_compile_latex(self) -> bool:
        """Run pdflatex once on the file, from the file's own directory.

        Returns:
            True when pdflatex exits with status 0. Otherwise False, with a
            human-readable reason stored in ``self._compile_error``.
        """
        self._compile_error = None
        try:
            result = subprocess.run(
                ['pdflatex', '-interaction=nonstopmode', self.filepath.name],
                cwd=self.filepath.parent,
                capture_output=True,
                timeout=self.LATEX_TIMEOUT_SECONDS
            )
        except FileNotFoundError:
            # The executable could not be launched at all: this is a missing
            # tool, not an error in the document.
            self._compile_error = (
                "pdflatex not found. Install a LaTeX distribution "
                "to compile Beamer presentations."
            )
            return False
        except subprocess.TimeoutExpired:
            self._compile_error = (
                f"LaTeX compilation timed out after "
                f"{self.LATEX_TIMEOUT_SECONDS} seconds."
            )
            return False

        if result.returncode != 0:
            self._compile_error = (
                "LaTeX compilation failed. Check .log file for errors."
            )
            return False
        return True

    def _check_slide_count(self, num_slides: int):
        """Check if slide count is appropriate for duration.

        Uses the SLIDE_GUIDELINES entry for ``self.duration``, or the entry
        for the nearest listed duration when there is no exact match.
        """
        if self.duration is None:
            return
        if self.duration <= 0:
            # A zero or negative talk length has no meaningful guideline.
            self.warnings.append(
                f"Invalid duration ({self.duration} minutes). "
                "Skipping slide count check."
            )
            return

        if self.duration not in self.SLIDE_GUIDELINES:
            # Find nearest duration
            durations = sorted(self.SLIDE_GUIDELINES.keys())
            nearest = min(durations, key=lambda x: abs(x - self.duration))
            min_slides, rec_slides, max_slides = self.SLIDE_GUIDELINES[nearest]
            self.info.append(
                f"Using guidelines for {nearest}-minute talk "
                f"(closest to {self.duration} minutes)"
            )
        else:
            min_slides, rec_slides, max_slides = self.SLIDE_GUIDELINES[self.duration]

        self.info.append(
            f"Recommended slides for {self.duration}-minute talk: "
            f"{min_slides}-{max_slides} (optimal: ~{rec_slides})"
        )

        if num_slides < min_slides:
            self.warnings.append(
                f"Fewer slides ({num_slides}) than recommended ({min_slides}-{max_slides}). "
                "May have too much time or too little content."
            )
        elif num_slides > max_slides:
            self.warnings.append(
                f"More slides ({num_slides}) than recommended ({min_slides}-{max_slides}). "
                "Likely to run over time."
            )
        else:
            self.info.append(
                f"Slide count ({num_slides}) is within recommended range."
            )

    def _format_results(self) -> Dict:
        """Format validation results; ``valid`` is True when no issues were recorded."""
        return {
            'filepath': str(self.filepath),
            'file_type': self.file_type,
            'info': self.info,
            'warnings': self.warnings,
            'issues': self.issues,
            'valid': len(self.issues) == 0
        }
def print_results(results: Dict):
"""Print validation results in a readable format."""
print()
print("=" * 60)
print("VALIDATION RESULTS")
print("=" * 60)
# Print info
if results['info']:
print("\n📋 Information:")
for item in results['info']:
print(f"{item}")
# Print warnings
if results['warnings']:
print("\n⚠️ Warnings:")
for item in results['warnings']:
print(f"{item}")
# Print issues
if results['issues']:
print("\n❌ Issues:")
for item in results['issues']:
print(f"{item}")
# Overall status
print("\n" + "=" * 60)
if results['valid']:
print("✅ Validation PASSED")
if results['warnings']:
print(f" ({len(results['warnings'])} warning(s) found)")
else:
print("❌ Validation FAILED")
print(f" ({len(results['issues'])} issue(s) found)")
print("=" * 60)
def main():
    """Command-line interface: validate one presentation file and report the findings.

    Exits with status 0 when validation found no issues and 1 otherwise.
    With ``--quiet`` only warnings and issues are reported.
    """
    parser = argparse.ArgumentParser(
        description='Validate scientific presentations',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s presentation.pdf --duration 15
%(prog)s slides.pptx --duration 45
%(prog)s beamer_talk.tex --duration 20
Supported file types:
- PDF (.pdf)
- PowerPoint (.pptx, .ppt)
- LaTeX Beamer (.tex)
Validation checks:
- Slide count vs. duration
- File size
- Slide dimensions
- Font sizes (PowerPoint)
- LaTeX compilation (Beamer)
"""
    )
    parser.add_argument(
        'filepath',
        help='Path to presentation file (PDF, PPTX, or TEX)'
    )
    parser.add_argument(
        '--duration', '-d',
        type=int,
        help='Presentation duration in minutes'
    )
    parser.add_argument(
        '--quiet', '-q',
        action='store_true',
        help='Only show issues and warnings'
    )
    args = parser.parse_args()

    # Validate
    validator = PresentationValidator(args.filepath, args.duration)
    if args.quiet:
        # validate() prints a progress banner; discard it in quiet mode so
        # only warnings and issues reach the user.
        import contextlib
        import io
        with contextlib.redirect_stdout(io.StringIO()):
            results = validator.validate()
    else:
        results = validator.validate()

    # Print results
    if args.quiet:
        # Only show warnings and issues: drop the informational section.
        if results['warnings'] or results['issues']:
            print_results({**results, 'info': []})
        else:
            print("✅ No issues found")
    else:
        print_results(results)

    # Exit with appropriate code
    sys.exit(0 if results['valid'] else 1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()