commit 993733823b2d16c3b9edb2f8aeca11dcdc044559 Author: dfty Date: Thu Jan 29 22:15:04 2026 +0800 Initial commit for markitdown diff --git a/INSTALLATION_GUIDE.md b/INSTALLATION_GUIDE.md new file mode 100644 index 0000000..4bd1fc1 --- /dev/null +++ b/INSTALLATION_GUIDE.md @@ -0,0 +1,318 @@ +# MarkItDown Installation Guide + +## Prerequisites + +- Python 3.10 or higher +- pip package manager +- Virtual environment (recommended) + +## Basic Installation + +### Install All Features (Recommended) + +```bash +pip install 'markitdown[all]' +``` + +This installs support for all file formats and features. + +### Install Specific Features + +If you only need certain file formats, you can install specific dependencies: + +```bash +# PDF support only +pip install 'markitdown[pdf]' + +# Office documents +pip install 'markitdown[docx,pptx,xlsx]' + +# Multiple formats +pip install 'markitdown[pdf,docx,pptx,xlsx,audio-transcription]' +``` + +### Install from Source + +```bash +git clone https://github.com/microsoft/markitdown.git +cd markitdown +pip install -e 'packages/markitdown[all]' +``` + +## Optional Dependencies + +| Feature | Installation | Use Case | +|---------|--------------|----------| +| All formats | `pip install 'markitdown[all]'` | Everything | +| PDF | `pip install 'markitdown[pdf]'` | PDF documents | +| Word | `pip install 'markitdown[docx]'` | DOCX files | +| PowerPoint | `pip install 'markitdown[pptx]'` | PPTX files | +| Excel (new) | `pip install 'markitdown[xlsx]'` | XLSX files | +| Excel (old) | `pip install 'markitdown[xls]'` | XLS files | +| Outlook | `pip install 'markitdown[outlook]'` | MSG files | +| Azure DI | `pip install 'markitdown[az-doc-intel]'` | Enhanced PDF | +| Audio | `pip install 'markitdown[audio-transcription]'` | WAV/MP3 | +| YouTube | `pip install 'markitdown[youtube-transcription]'` | YouTube videos | + +## System Dependencies + +### OCR Support (for scanned documents and images) + +#### macOS +```bash +brew install tesseract +``` + 
+#### Ubuntu/Debian +```bash +sudo apt-get update +sudo apt-get install tesseract-ocr +``` + +#### Windows +Download from: https://github.com/UB-Mannheim/tesseract/wiki + +### Poppler Utils (for advanced PDF operations) + +#### macOS +```bash +brew install poppler +``` + +#### Ubuntu/Debian +```bash +sudo apt-get install poppler-utils +``` + +## Verification + +Test your installation: + +```bash +# Check that the package can be imported +python -c "import markitdown; print('MarkItDown installed successfully')" + +# Test basic conversion +echo "Test" > test.txt +markitdown test.txt +rm test.txt +``` + +## Virtual Environment Setup + +### Using venv + +```bash +# Create virtual environment +python -m venv markitdown-env + +# Activate (macOS/Linux) +source markitdown-env/bin/activate + +# Activate (Windows) +markitdown-env\Scripts\activate + +# Install +pip install 'markitdown[all]' +``` + +### Using conda + +```bash +# Create environment +conda create -n markitdown python=3.12 + +# Activate +conda activate markitdown + +# Install +pip install 'markitdown[all]' +``` + +### Using uv + +```bash +# Create virtual environment +uv venv --python=3.12 .venv + +# Activate +source .venv/bin/activate + +# Install +uv pip install 'markitdown[all]' +``` + +## AI Enhancement Setup (Optional) + +For AI-powered image descriptions using OpenRouter: + +### OpenRouter API + +OpenRouter provides unified access to multiple AI models (GPT-4, Claude, Gemini, etc.) through a single API. + +```bash +# Install OpenAI SDK (required, already included with markitdown) +pip install openai + +# Get API key from https://openrouter.ai/keys + +# Set API key +export OPENROUTER_API_KEY="sk-or-v1-..." 
+ +# Add to shell profile for persistence +echo 'export OPENROUTER_API_KEY="sk-or-v1-..."' >> ~/.bashrc # Linux +echo 'export OPENROUTER_API_KEY="sk-or-v1-..."' >> ~/.zshrc # macOS +``` + +**Why OpenRouter?** +- Access to 100+ AI models through one API +- Choose between GPT-4, Claude, Gemini, and more +- Competitive pricing +- No vendor lock-in +- Simple OpenAI-compatible interface + +**Popular Models for Image Description:** +- `anthropic/claude-sonnet-4.5` - **Recommended** - Best for scientific vision +- `anthropic/claude-opus-4.5` - Excellent technical analysis +- `openai/gpt-4o` - Good vision understanding +- `google/gemini-pro-vision` - Cost-effective option + +See https://openrouter.ai/models for complete model list and pricing. + +## Azure Document Intelligence Setup (Optional) + +For enhanced PDF conversion: + +1. Create Azure Document Intelligence resource in Azure Portal +2. Get endpoint and key +3. Set environment variables: + +```bash +export AZURE_DOCUMENT_INTELLIGENCE_KEY="your-key" +export AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT="https://your-endpoint.cognitiveservices.azure.com/" +``` + +## Docker Installation (Alternative) + +```bash +# Clone repository +git clone https://github.com/microsoft/markitdown.git +cd markitdown + +# Build image +docker build -t markitdown:latest . 
+ +# Run +docker run --rm -i markitdown:latest < input.pdf > output.md +``` + +## Troubleshooting + +### Import Error +``` +ModuleNotFoundError: No module named 'markitdown' +``` + +**Solution**: Ensure you're in the correct virtual environment and markitdown is installed: +```bash +pip install 'markitdown[all]' +``` + +### Missing Feature +``` +Error: PDF conversion not supported +``` + +**Solution**: Install the specific feature: +```bash +pip install 'markitdown[pdf]' +``` + +### OCR Not Working + +**Solution**: Install Tesseract OCR (see System Dependencies above) + +### Permission Errors + +**Solution**: Use virtual environment or install with `--user` flag: +```bash +pip install --user 'markitdown[all]' +``` + +## Upgrading + +```bash +# Upgrade to latest version +pip install --upgrade 'markitdown[all]' + +# Check version +pip show markitdown +``` + +## Uninstallation + +```bash +pip uninstall markitdown +``` + +## Next Steps + +After installation: +1. Read `QUICK_REFERENCE.md` for basic usage +2. See `SKILL.md` for comprehensive guide +3. Try example scripts in `scripts/` directory +4. 
Check `assets/example_usage.md` for practical examples + +## Skill Scripts Setup + +To use the skill scripts: + +```bash +# Navigate to the skill's scripts directory (adjust the path to your installation) +cd path/to/markitdown/scripts + +# Scripts are already executable, just run them +python batch_convert.py --help +python convert_with_ai.py --help +python convert_literature.py --help +``` + +## Testing Installation + +Create a test file to verify everything works: + +```python +# test_markitdown.py +from markitdown import MarkItDown + +def test_basic(): + md = MarkItDown() + # Create a simple test file + with open("test.txt", "w") as f: + f.write("Hello MarkItDown!") + + # Convert it + result = md.convert("test.txt") + print("✓ Basic conversion works") + print(result.text_content) + + # Cleanup + import os + os.remove("test.txt") + +if __name__ == "__main__": + test_basic() +``` + +Run it: +```bash +python test_markitdown.py +``` + +## Getting Help + +- **Documentation**: See `SKILL.md` and `README.md` +- **GitHub Issues**: https://github.com/microsoft/markitdown/issues +- **Examples**: `assets/example_usage.md` +- **API Reference**: `references/api_reference.md` + diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..72196cb --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) Microsoft Corporation. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/OPENROUTER_INTEGRATION.md b/OPENROUTER_INTEGRATION.md new file mode 100644 index 0000000..f15af23 --- /dev/null +++ b/OPENROUTER_INTEGRATION.md @@ -0,0 +1,359 @@ +# OpenRouter Integration for MarkItDown + +## Overview + +This MarkItDown skill has been configured to use **OpenRouter** instead of direct OpenAI API access. OpenRouter provides a unified API gateway to access 100+ AI models from different providers through a single, OpenAI-compatible interface. + +## Why OpenRouter? + +### Benefits + +1. **Multiple Model Access**: Access GPT-4, Claude, Gemini, and 100+ other models through one API +2. **No Vendor Lock-in**: Switch between models without code changes +3. **Competitive Pricing**: Often better rates than going direct +4. **Simple Migration**: OpenAI-compatible API means minimal code changes +5. **Flexible Choice**: Choose the best model for each task + +### Popular Models for Image Description + +| Model | Provider | Use Case | Vision Support | +|-------|----------|----------|----------------| +| `anthropic/claude-sonnet-4.5` | Anthropic | **Recommended** - Best overall for scientific analysis | ✅ | +| `anthropic/claude-opus-4.5` | Anthropic | Excellent technical analysis | ✅ | +| `openai/gpt-4o` | OpenAI | Strong vision understanding | ✅ | +| `openai/gpt-4-vision` | OpenAI | GPT-4 with vision | ✅ | +| `google/gemini-pro-vision` | Google | Cost-effective option | ✅ | + +See https://openrouter.ai/models for the complete list. + +## Getting Started + +### 1. Get an API Key + +1. 
Visit https://openrouter.ai/keys +2. Sign up or log in +3. Create a new API key +4. Copy the key (starts with `sk-or-v1-...`) + +### 2. Set Environment Variable + +```bash +# Add to your environment +export OPENROUTER_API_KEY="sk-or-v1-..." + +# Make it permanent +echo 'export OPENROUTER_API_KEY="sk-or-v1-..."' >> ~/.zshrc # macOS +echo 'export OPENROUTER_API_KEY="sk-or-v1-..."' >> ~/.bashrc # Linux + +# Reload shell +source ~/.zshrc # or source ~/.bashrc +``` + +### 3. Use in Python + +```python +from markitdown import MarkItDown +from openai import OpenAI + +# Initialize OpenRouter client (OpenAI-compatible) +client = OpenAI( + api_key="your-openrouter-api-key", # or use env var + base_url="https://openrouter.ai/api/v1" +) + +# Create MarkItDown with AI support +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5" # Choose your model +) + +# Convert with AI-enhanced descriptions +result = md.convert("presentation.pptx") +print(result.text_content) +``` + +## Using the Scripts + +All skill scripts have been updated to use OpenRouter: + +### convert_with_ai.py + +```bash +# Set API key +export OPENROUTER_API_KEY="sk-or-v1-..." 
+ +# Convert with default model (advanced vision model) +python scripts/convert_with_ai.py paper.pdf output.md --prompt-type scientific + +# Use GPT-4o as alternative +python scripts/convert_with_ai.py paper.pdf output.md \ + --model openai/gpt-4o \ + --prompt-type scientific + +# Use Gemini Pro Vision (cost-effective) +python scripts/convert_with_ai.py slides.pptx output.md \ + --model google/gemini-pro-vision \ + --prompt-type presentation + +# List available prompt types +python scripts/convert_with_ai.py --list-prompts +``` + +### Choosing the Right Model + +```bash +# For scientific papers - use advanced vision model for technical analysis +python scripts/convert_with_ai.py research.pdf output.md \ + --model anthropic/claude-sonnet-4.5 \ + --prompt-type scientific + +# For presentations - use advanced vision model +python scripts/convert_with_ai.py slides.pptx output.md \ + --model anthropic/claude-sonnet-4.5 \ + --prompt-type presentation + +# For data visualizations - use advanced vision model +python scripts/convert_with_ai.py charts.pdf output.md \ + --model anthropic/claude-sonnet-4.5 \ + --prompt-type data_viz + +# For medical images - use advanced vision model for detailed analysis +python scripts/convert_with_ai.py xray.jpg output.md \ + --model anthropic/claude-sonnet-4.5 \ + --prompt-type medical +``` + +## Code Examples + +### Basic Usage + +```python +from markitdown import MarkItDown +from openai import OpenAI +import os + +# Initialize OpenRouter client +client = OpenAI( + api_key=os.environ.get("OPENROUTER_API_KEY"), + base_url="https://openrouter.ai/api/v1" +) + +# Use advanced vision model for image descriptions +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5" +) + +result = md.convert("document.pptx") +print(result.text_content) +``` + +### Switching Models Dynamically + +```python +from markitdown import MarkItDown +from openai import OpenAI +import os + +client = OpenAI( + 
api_key=os.environ["OPENROUTER_API_KEY"], + base_url="https://openrouter.ai/api/v1" +) + +# Use different models for different file types +def convert_with_best_model(filepath): + if filepath.endswith('.pdf'): + # Use advanced vision model for technical PDFs + md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", + llm_prompt="Describe scientific figures with technical precision" + ) + elif filepath.endswith('.pptx'): + # Use advanced vision model for presentations + md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", + llm_prompt="Describe slide content and visual elements" + ) + else: + # Use advanced vision model as default + md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5" + ) + + return md.convert(filepath) + +# Use it +result = convert_with_best_model("paper.pdf") +``` + +### Custom Prompts per Model + +```python +from markitdown import MarkItDown +from openai import OpenAI + +client = OpenAI( + api_key="your-openrouter-api-key", + base_url="https://openrouter.ai/api/v1" +) + +# Scientific analysis with advanced vision model +scientific_prompt = """ +Analyze this scientific figure. Provide: +1. Type of visualization and methodology +2. Quantitative data points and trends +3. Statistical significance +4. Technical interpretation +Be precise and use scientific terminology. +""" + +md_scientific = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", + llm_prompt=scientific_prompt +) + +# Visual analysis with advanced vision model +visual_prompt = """ +Describe this image comprehensively: +1. Main visual elements and composition +2. Colors, layout, and design +3. Text and labels +4. 
Overall message +""" + +md_visual = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", + llm_prompt=visual_prompt +) +``` + +## Model Comparison + +### For Scientific Content + +**Recommended: anthropic/claude-sonnet-4.5** +- Excellent at technical analysis +- Superior reasoning capabilities +- Best at understanding scientific figures +- Most detailed and accurate explanations +- Advanced vision capabilities + +**Alternative: openai/gpt-4o** +- Good vision understanding +- Fast processing +- Good at charts and graphs + +### For Presentations + +**Recommended: anthropic/claude-sonnet-4.5** +- Superior vision capabilities +- Excellent at understanding slide layouts +- Fast and reliable +- Best technical comprehension + +### For Cost-Effectiveness + +**Recommended: google/gemini-pro-vision** +- Lower cost per request +- Good quality +- Fast processing + +## Pricing Considerations + +OpenRouter pricing varies by model. Check current rates at https://openrouter.ai/models + +**Tips for Cost Optimization:** +1. Use advanced vision models for best quality on complex scientific content +2. Use cheaper models (Gemini) for simple images +3. Batch process similar content with the same model +4. Use appropriate prompts to get better results in fewer retries + +## Troubleshooting + +### API Key Issues + +```bash +# Check if key is set +echo $OPENROUTER_API_KEY + +# Should show: sk-or-v1-... +# If empty, set it: +export OPENROUTER_API_KEY="sk-or-v1-..." +``` + +### Model Not Found + +If you get a "model not found" error, check: +1. Model name format: `provider/model-name` +2. Model availability: https://openrouter.ai/models +3. Vision support: Ensure model supports vision for image description + +### Rate Limits + +OpenRouter has rate limits. If you hit them: +1. Add delays between requests +2. Use batch processing scripts with `--workers` parameter +3. 
Consider upgrading your OpenRouter plan + +## Migration Notes + +This skill was updated from direct OpenAI API to OpenRouter. Key changes: + +1. **Environment Variable**: `OPENAI_API_KEY` → `OPENROUTER_API_KEY` +2. **Client Initialization**: Added `base_url="https://openrouter.ai/api/v1"` +3. **Model Names**: `gpt-4o` → `openai/gpt-4o` (with provider prefix) +4. **Script Updates**: All scripts now use OpenRouter by default + +## Resources + +- **OpenRouter Website**: https://openrouter.ai +- **Get API Keys**: https://openrouter.ai/keys +- **Model List**: https://openrouter.ai/models +- **Pricing**: https://openrouter.ai/models (click on model for details) +- **Documentation**: https://openrouter.ai/docs +- **Support**: https://openrouter.ai/discord + +## Example Workflow + +Here's a complete workflow using OpenRouter: + +```bash +# 1. Set up API key +export OPENROUTER_API_KEY="sk-or-v1-your-key-here" + +# 2. Convert a scientific paper with Claude +python scripts/convert_with_ai.py \ + research_paper.pdf \ + output.md \ + --model anthropic/claude-opus-4.5 \ + --prompt-type scientific + +# 3. Convert presentation with GPT-4o +python scripts/convert_with_ai.py \ + talk_slides.pptx \ + slides.md \ + --model openai/gpt-4o \ + --prompt-type presentation + +# 4. 
Batch convert a folder of images (no model is selected by this command) +python scripts/batch_convert.py \ + images/ \ + markdown_output/ \ + --extensions .jpg .png +``` + +## Support + +For OpenRouter-specific issues: +- Discord: https://openrouter.ai/discord +- Email: support@openrouter.ai + +For MarkItDown skill issues: +- Check documentation in this skill directory +- Review examples in `assets/example_usage.md` + diff --git a/QUICK_REFERENCE.md b/QUICK_REFERENCE.md new file mode 100644 index 0000000..09e2dc8 --- /dev/null +++ b/QUICK_REFERENCE.md @@ -0,0 +1,309 @@ +# MarkItDown Quick Reference + +## Installation + +```bash +# All features +pip install 'markitdown[all]' + +# Specific formats +pip install 'markitdown[pdf,docx,pptx,xlsx]' +``` + +## Basic Usage + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("file.pdf") +print(result.text_content) +``` + +## Command Line + +```bash +# Simple conversion +markitdown input.pdf > output.md +markitdown input.pdf -o output.md + +# With plugins +markitdown --use-plugins file.pdf -o output.md +``` + +## Common Tasks + +### Convert PDF +```python +md = MarkItDown() +result = md.convert("paper.pdf") +``` + +### Convert with AI +```python +from openai import OpenAI + +# Use OpenRouter for multiple model access +client = OpenAI( + api_key="your-openrouter-api-key", + base_url="https://openrouter.ai/api/v1" +) + +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5" # recommended for vision +) +result = md.convert("slides.pptx") +``` + +### Batch Convert +```bash +python scripts/batch_convert.py input/ output/ --extensions .pdf .docx +``` + +### Literature Conversion +```bash +python scripts/convert_literature.py papers/ markdown/ --create-index +``` + +## Supported Formats + +| Format | Extension | Notes | +|--------|-----------|-------| +| PDF | `.pdf` | Full text + OCR | +| Word | `.docx` | Tables, formatting | +| PowerPoint | `.pptx` | Slides + notes | +| Excel | `.xlsx`, `.xls` | 
Tables | +| Images | `.jpg`, `.png`, `.gif`, `.webp` | EXIF + OCR | +| Audio | `.wav`, `.mp3` | Transcription | +| HTML | `.html`, `.htm` | Clean conversion | +| Data | `.csv`, `.json`, `.xml` | Structured | +| Archives | `.zip` | Iterates contents | +| E-books | `.epub` | Full text | +| YouTube | URLs | Transcripts | + +## Optional Dependencies + +```bash +[all] # All features +[pdf] # PDF support +[docx] # Word documents +[pptx] # PowerPoint +[xlsx] # Excel +[xls] # Old Excel +[outlook] # Outlook messages +[az-doc-intel] # Azure Document Intelligence +[audio-transcription] # Audio files +[youtube-transcription] # YouTube videos +``` + +## AI-Enhanced Conversion + +### Scientific Papers +```python +from openai import OpenAI + +# Initialize OpenRouter client +client = OpenAI( + api_key="your-openrouter-api-key", + base_url="https://openrouter.ai/api/v1" +) + +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", # recommended for scientific vision + llm_prompt="Describe scientific figures with technical precision" +) +result = md.convert("paper.pdf") +``` + +### Custom Prompts +```python +prompt = """ +Analyze this data visualization. 
Describe: +- Type of chart/graph +- Key trends and patterns +- Notable data points +""" + +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", + llm_prompt=prompt +) +``` + +### Available Models via OpenRouter +- `anthropic/claude-sonnet-4.5` - **Recommended for scientific vision** +- `anthropic/claude-opus-4.5` - Advanced vision model +- `openai/gpt-4o` - GPT-4 Omni (vision) +- `openai/gpt-4-vision` - GPT-4 Vision +- `google/gemini-pro-vision` - Gemini Pro Vision + +See https://openrouter.ai/models for full list + +## Azure Document Intelligence + +```python +md = MarkItDown(docintel_endpoint="https://YOUR-ENDPOINT.cognitiveservices.azure.com/") +result = md.convert("complex_layout.pdf") +``` + +## Batch Processing + +### Python +```python +from markitdown import MarkItDown +from pathlib import Path + +md = MarkItDown() + +for file in Path("input/").glob("*.pdf"): + result = md.convert(str(file)) + output = Path("output") / f"{file.stem}.md" + output.write_text(result.text_content) +``` + +### Script +```bash +# Parallel conversion +python scripts/batch_convert.py input/ output/ --workers 8 + +# Recursive +python scripts/batch_convert.py input/ output/ -r +``` + +## Error Handling + +```python +try: + result = md.convert("file.pdf") +except FileNotFoundError: + print("File not found") +except Exception as e: + print(f"Error: {e}") +``` + +## Streaming + +```python +with open("large_file.pdf", "rb") as f: + result = md.convert_stream(f, file_extension=".pdf") +``` + +## Common Prompts + +### Scientific +``` +Analyze this scientific figure. Describe: +- Type of visualization +- Key data points and trends +- Axes, labels, and legends +- Scientific significance +``` + +### Medical +``` +Describe this medical image. Include: +- Type of imaging (X-ray, MRI, CT, etc.) 
+- Anatomical structures visible +- Notable findings +- Clinical relevance +``` + +### Data Visualization +``` +Analyze this data visualization: +- Chart type +- Variables and axes +- Data ranges +- Key patterns and outliers +``` + +## Performance Tips + +1. **Reuse instance**: Create once, use many times +2. **Parallel processing**: Use ThreadPoolExecutor for multiple files +3. **Stream large files**: Use `convert_stream()` for big files +4. **Choose right format**: Install only needed dependencies + +## Environment Variables + +```bash +# OpenRouter for AI-enhanced conversions +export OPENROUTER_API_KEY="sk-or-v1-..." + +# Azure Document Intelligence (optional) +export AZURE_DOCUMENT_INTELLIGENCE_KEY="key..." +export AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT="https://..." +``` + +## Scripts Quick Reference + +### batch_convert.py +```bash +python scripts/batch_convert.py INPUT OUTPUT [OPTIONS] + +Options: + --extensions .pdf .docx File types to convert + --recursive, -r Search subdirectories + --workers 4 Parallel workers + --verbose, -v Detailed output + --plugins, -p Enable plugins +``` + +### convert_with_ai.py +```bash +python scripts/convert_with_ai.py INPUT OUTPUT [OPTIONS] + +Options: + --api-key KEY OpenRouter API key + --model MODEL Model name (default: anthropic/claude-sonnet-4.5) + --prompt-type TYPE Preset prompt (scientific, medical, etc.) 
+ --custom-prompt TEXT Custom prompt + --list-prompts Show available prompts +``` + +### convert_literature.py +```bash +python scripts/convert_literature.py INPUT OUTPUT [OPTIONS] + +Options: + --organize-by-year, -y Organize by year + --create-index, -i Create index file + --recursive, -r Search subdirectories +``` + +## Troubleshooting + +### Missing Dependencies +```bash +pip install 'markitdown[pdf]' # Install PDF support +``` + +### Binary File Error +```python +# Wrong +with open("file.pdf", "r") as f: + +# Correct +with open("file.pdf", "rb") as f: # Binary mode +``` + +### OCR Not Working +```bash +# macOS +brew install tesseract + +# Ubuntu +sudo apt-get install tesseract-ocr +``` + +## More Information + +- **Full Documentation**: See `SKILL.md` +- **API Reference**: See `references/api_reference.md` +- **Format Details**: See `references/file_formats.md` +- **Examples**: See `assets/example_usage.md` +- **GitHub**: https://github.com/microsoft/markitdown + diff --git a/README.md b/README.md new file mode 100644 index 0000000..9769486 --- /dev/null +++ b/README.md @@ -0,0 +1,184 @@ +# MarkItDown Skill + +This skill provides comprehensive support for converting various file formats to Markdown using Microsoft's MarkItDown tool. + +## Overview + +MarkItDown is a Python tool that converts files and office documents to Markdown format. 
This skill includes: + +- Complete API documentation +- Format-specific conversion guides +- Utility scripts for batch processing +- AI-enhanced conversion examples +- Integration with scientific workflows + +## Contents + +### Main Skill File +- **SKILL.md** - Complete guide to using MarkItDown with quick start, examples, and best practices + +### References +- **api_reference.md** - Detailed API documentation, class references, and method signatures +- **file_formats.md** - Format-specific details for all supported file types + +### Scripts +- **batch_convert.py** - Batch convert multiple files with parallel processing +- **convert_with_ai.py** - AI-enhanced conversion with custom prompts +- **convert_literature.py** - Scientific literature conversion with metadata extraction + +### Assets +- **example_usage.md** - Practical examples for common use cases + +## Installation + +```bash +# Install with all features +pip install 'markitdown[all]' + +# Or install specific features +pip install 'markitdown[pdf,docx,pptx,xlsx]' +``` + +## Quick Start + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("document.pdf") +print(result.text_content) +``` + +## Supported Formats + +- **Documents**: PDF, DOCX, PPTX, XLSX, EPUB +- **Images**: JPEG, PNG, GIF, WebP (with OCR) +- **Audio**: WAV, MP3 (with transcription) +- **Web**: HTML, YouTube URLs +- **Data**: CSV, JSON, XML +- **Archives**: ZIP files + +## Key Features + +### 1. AI-Enhanced Conversions +Use AI models via OpenRouter to generate detailed image descriptions: + +```python +from openai import OpenAI + +# OpenRouter provides access to 100+ AI models +client = OpenAI( + api_key="your-openrouter-api-key", + base_url="https://openrouter.ai/api/v1" +) + +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5" # recommended for vision +) +result = md.convert("presentation.pptx") +``` + +### 2. 
Batch Processing +Convert multiple files efficiently: + +```bash +python scripts/batch_convert.py papers/ output/ --extensions .pdf .docx +``` + +### 3. Scientific Literature +Convert and organize research papers: + +```bash +python scripts/convert_literature.py papers/ output/ --organize-by-year --create-index +``` + +### 4. Azure Document Intelligence +Enhanced PDF conversion with Microsoft Document Intelligence: + +```python +md = MarkItDown(docintel_endpoint="https://YOUR-ENDPOINT.cognitiveservices.azure.com/") +result = md.convert("complex_document.pdf") +``` + +## Use Cases + +### Literature Review +Convert research papers to Markdown for easier analysis and note-taking. + +### Data Extraction +Extract tables from Excel files into Markdown format. + +### Presentation Processing +Convert PowerPoint slides with AI-generated descriptions. + +### Document Analysis +Process documents for LLM consumption with token-efficient Markdown. + +### YouTube Transcripts +Fetch and convert YouTube video transcriptions. + +## Scripts Usage + +### Batch Convert +```bash +# Convert all PDFs in a directory +python scripts/batch_convert.py input_dir/ output_dir/ --extensions .pdf + +# Recursive with multiple formats +python scripts/batch_convert.py docs/ markdown/ --extensions .pdf .docx .pptx -r +``` + +### AI-Enhanced Conversion +```bash +# Convert with AI descriptions via OpenRouter +export OPENROUTER_API_KEY="sk-or-v1-..." 
+python scripts/convert_with_ai.py paper.pdf output.md --prompt-type scientific + +# Use different models +python scripts/convert_with_ai.py image.png output.md --model anthropic/claude-sonnet-4.5 + +# Use custom prompt +python scripts/convert_with_ai.py image.png output.md --custom-prompt "Describe this diagram" +``` + +### Literature Conversion +```bash +# Convert papers with metadata extraction +python scripts/convert_literature.py papers/ markdown/ --organize-by-year --create-index +``` + +## Integration with Scientific Writer + +This skill integrates seamlessly with the Scientific Writer CLI for: +- Converting source materials for paper writing +- Processing literature for reviews +- Extracting data from various document formats +- Preparing documents for LLM analysis + +## Resources + +- **MarkItDown GitHub**: https://github.com/microsoft/markitdown +- **PyPI**: https://pypi.org/project/markitdown/ +- **OpenRouter**: https://openrouter.ai (AI model access) +- **OpenRouter API Keys**: https://openrouter.ai/keys +- **OpenRouter Models**: https://openrouter.ai/models +- **License**: MIT + +## Requirements + +- Python 3.10+ +- Optional dependencies based on formats needed +- OpenRouter API key (for AI-enhanced conversions) - Get at https://openrouter.ai/keys +- Azure subscription (optional, for Document Intelligence) + +## Examples + +See `assets/example_usage.md` for comprehensive examples covering: +- Basic conversions +- Scientific workflows +- AI-enhanced processing +- Batch operations +- Error handling +- Integration patterns + diff --git a/SKILL.md b/SKILL.md new file mode 100644 index 0000000..3ad7f94 --- /dev/null +++ b/SKILL.md @@ -0,0 +1,486 @@ +--- +name: markitdown +description: "Convert files and office documents to Markdown. Supports PDF, DOCX, PPTX, XLSX, images (with OCR), audio (with transcription), HTML, CSV, JSON, XML, ZIP, YouTube URLs, EPubs and more." 
+allowed-tools: [Read, Write, Edit, Bash] +license: MIT +source: https://github.com/microsoft/markitdown +--- + +# MarkItDown - File to Markdown Conversion + +## Overview + +MarkItDown is a Python tool developed by Microsoft for converting various file formats to Markdown. It's particularly useful for converting documents into LLM-friendly text format, as Markdown is token-efficient and well-understood by modern language models. + +**Key Benefits**: +- Convert documents to clean, structured Markdown +- Token-efficient format for LLM processing +- Supports 15+ file formats +- Optional AI-enhanced image descriptions +- OCR for images and scanned documents +- Speech transcription for audio files + +## Visual Enhancement with Scientific Schematics + +**When creating documents with this skill, always consider adding scientific diagrams and schematics to enhance visual communication.** + +If your document does not already contain schematics or diagrams: +- Use the **scientific-schematics** skill to generate AI-powered publication-quality diagrams +- Simply describe your desired diagram in natural language +- Nano Banana Pro will automatically generate, review, and refine the schematic + +**For new documents:** Scientific schematics should be generated by default to visually represent key concepts, workflows, architectures, or relationships described in the text. 
+ +**How to generate schematics:** +```bash +python scripts/generate_schematic.py "your diagram description" -o figures/output.png +``` + +The AI will automatically: +- Create publication-quality images with proper formatting +- Review and refine through multiple iterations +- Ensure accessibility (colorblind-friendly, high contrast) +- Save outputs in the figures/ directory + +**When to add schematics:** +- Document conversion workflow diagrams +- File format architecture illustrations +- OCR processing pipeline diagrams +- Integration workflow visualizations +- System architecture diagrams +- Data flow diagrams +- Any complex concept that benefits from visualization + +For detailed guidance on creating schematics, refer to the scientific-schematics skill documentation. + +--- + +## Supported Formats + +| Format | Description | Notes | +|--------|-------------|-------| +| **PDF** | Portable Document Format | Full text extraction | +| **DOCX** | Microsoft Word | Tables, formatting preserved | +| **PPTX** | PowerPoint | Slides with notes | +| **XLSX** | Excel spreadsheets | Tables and data | +| **Images** | JPEG, PNG, GIF, WebP | EXIF metadata + OCR | +| **Audio** | WAV, MP3 | Metadata + transcription | +| **HTML** | Web pages | Clean conversion | +| **CSV** | Comma-separated values | Table format | +| **JSON** | JSON data | Structured representation | +| **XML** | XML documents | Structured format | +| **ZIP** | Archive files | Iterates contents | +| **EPUB** | E-books | Full text extraction | +| **YouTube** | Video URLs | Fetch transcriptions | + +## Quick Start + +### Installation + +```bash +# Install with all features +pip install 'markitdown[all]' + +# Or from source +git clone https://github.com/microsoft/markitdown.git +cd markitdown +pip install -e 'packages/markitdown[all]' +``` + +### Command-Line Usage + +```bash +# Basic conversion +markitdown document.pdf > output.md + +# Specify output file +markitdown document.pdf -o output.md + +# Pipe content +cat 
document.pdf | markitdown > output.md + +# Enable plugins +markitdown --list-plugins # List available plugins +markitdown --use-plugins document.pdf -o output.md +``` + +### Python API + +```python +from markitdown import MarkItDown + +# Basic usage +md = MarkItDown() +result = md.convert("document.pdf") +print(result.text_content) + +# Convert from stream +with open("document.pdf", "rb") as f: + result = md.convert_stream(f, file_extension=".pdf") + print(result.text_content) +``` + +## Advanced Features + +### 1. AI-Enhanced Image Descriptions + +Use LLMs via OpenRouter to generate detailed image descriptions (for PPTX and image files): + +```python +from markitdown import MarkItDown +from openai import OpenAI + +# Initialize OpenRouter client (OpenAI-compatible API) +client = OpenAI( + api_key="your-openrouter-api-key", + base_url="https://openrouter.ai/api/v1" +) + +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", # recommended for scientific vision + llm_prompt="Describe this image in detail for scientific documentation" +) + +result = md.convert("presentation.pptx") +print(result.text_content) +``` + +### 2. Azure Document Intelligence + +For enhanced PDF conversion with Microsoft Document Intelligence: + +```bash +# Command line +markitdown document.pdf -o output.md -d -e "your-document-intelligence-endpoint" +``` + +```python +# Python API +from markitdown import MarkItDown + +md = MarkItDown(docintel_endpoint="your-document-intelligence-endpoint") +result = md.convert("complex_document.pdf") +print(result.text_content) +``` + +### 3. 
Plugin System + +MarkItDown supports 3rd-party plugins for extending functionality: + +```bash +# List installed plugins +markitdown --list-plugins + +# Enable plugins +markitdown --use-plugins file.pdf -o output.md +``` + +Find plugins on GitHub with hashtag: `#markitdown-plugin` + +## Optional Dependencies + +Control which file formats you support: + +```bash +# Install specific formats +pip install 'markitdown[pdf, docx, pptx]' + +# All available options: +# [all] - All optional dependencies +# [pptx] - PowerPoint files +# [docx] - Word documents +# [xlsx] - Excel spreadsheets +# [xls] - Older Excel files +# [pdf] - PDF documents +# [outlook] - Outlook messages +# [az-doc-intel] - Azure Document Intelligence +# [audio-transcription] - WAV and MP3 transcription +# [youtube-transcription] - YouTube video transcription +``` + +## Common Use Cases + +### 1. Convert Scientific Papers to Markdown + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# Convert PDF paper +result = md.convert("research_paper.pdf") +with open("paper.md", "w") as f: + f.write(result.text_content) +``` + +### 2. Extract Data from Excel for Analysis + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("data.xlsx") + +# Result will be in Markdown table format +print(result.text_content) +``` + +### 3. Process Multiple Documents + +```python +from markitdown import MarkItDown +import os +from pathlib import Path + +md = MarkItDown() + +# Process all PDFs in a directory +pdf_dir = Path("papers/") +output_dir = Path("markdown_output/") +output_dir.mkdir(exist_ok=True) + +for pdf_file in pdf_dir.glob("*.pdf"): + result = md.convert(str(pdf_file)) + output_file = output_dir / f"{pdf_file.stem}.md" + output_file.write_text(result.text_content) + print(f"Converted: {pdf_file.name}") +``` + +### 4. 
Convert PowerPoint with AI Descriptions + +```python +from markitdown import MarkItDown +from openai import OpenAI + +# Use OpenRouter for access to multiple AI models +client = OpenAI( + api_key="your-openrouter-api-key", + base_url="https://openrouter.ai/api/v1" +) + +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", # recommended for presentations + llm_prompt="Describe this slide image in detail, focusing on key visual elements and data" +) + +result = md.convert("presentation.pptx") +with open("presentation.md", "w") as f: + f.write(result.text_content) +``` + +### 5. Batch Convert with Different Formats + +```python +from markitdown import MarkItDown +from pathlib import Path + +md = MarkItDown() + +# Files to convert +files = [ + "document.pdf", + "spreadsheet.xlsx", + "presentation.pptx", + "notes.docx" +] + +for file in files: + try: + result = md.convert(file) + output = Path(file).stem + ".md" + with open(output, "w") as f: + f.write(result.text_content) + print(f"✓ Converted {file}") + except Exception as e: + print(f"✗ Error converting {file}: {e}") +``` + +### 6. Extract YouTube Video Transcription + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# Convert YouTube video to transcript +result = md.convert("https://www.youtube.com/watch?v=VIDEO_ID") +print(result.text_content) +``` + +## Docker Usage + +```bash +# Build image +docker build -t markitdown:latest . + +# Run conversion +docker run --rm -i markitdown:latest < ~/document.pdf > output.md +``` + +## Best Practices + +### 1. Choose the Right Conversion Method + +- **Simple documents**: Use basic `MarkItDown()` +- **Complex PDFs**: Use Azure Document Intelligence +- **Visual content**: Enable AI image descriptions +- **Scanned documents**: Ensure OCR dependencies are installed + +### 2. 
Handle Errors Gracefully + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +try: + result = md.convert("document.pdf") + print(result.text_content) +except FileNotFoundError: + print("File not found") +except Exception as e: + print(f"Conversion error: {e}") +``` + +### 3. Process Large Files Efficiently + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# For large files, use streaming +with open("large_file.pdf", "rb") as f: + result = md.convert_stream(f, file_extension=".pdf") + + # Process in chunks or save directly + with open("output.md", "w") as out: + out.write(result.text_content) +``` + +### 4. Optimize for Token Efficiency + +Markdown output is already token-efficient, but you can: +- Remove excessive whitespace +- Consolidate similar sections +- Strip metadata if not needed + +```python +from markitdown import MarkItDown +import re + +md = MarkItDown() +result = md.convert("document.pdf") + +# Clean up extra whitespace +clean_text = re.sub(r'\n{3,}', '\n\n', result.text_content) +clean_text = clean_text.strip() + +print(clean_text) +``` + +## Integration with Scientific Workflows + +### Convert Literature for Review + +```python +from markitdown import MarkItDown +from pathlib import Path + +md = MarkItDown() + +# Convert all papers in literature folder +papers_dir = Path("literature/pdfs") +output_dir = Path("literature/markdown") +output_dir.mkdir(exist_ok=True) + +for paper in papers_dir.glob("*.pdf"): + result = md.convert(str(paper)) + + # Save with metadata + output_file = output_dir / f"{paper.stem}.md" + content = f"# {paper.stem}\n\n" + content += f"**Source**: {paper.name}\n\n" + content += "---\n\n" + content += result.text_content + + output_file.write_text(content) + +# For AI-enhanced conversion with figures +from openai import OpenAI + +client = OpenAI( + api_key="your-openrouter-api-key", + base_url="https://openrouter.ai/api/v1" +) + +md_ai = MarkItDown( + llm_client=client, + 
llm_model="anthropic/claude-sonnet-4.5", + llm_prompt="Describe scientific figures with technical precision" +) +``` + +### Extract Tables for Analysis + +```python +from markitdown import MarkItDown +import re + +md = MarkItDown() +result = md.convert("data_tables.xlsx") + +# Markdown tables can be parsed or used directly +print(result.text_content) +``` + +## Troubleshooting + +### Common Issues + +1. **Missing dependencies**: Install feature-specific packages + ```bash + pip install 'markitdown[pdf]' # For PDF support + ``` + +2. **Binary file errors**: Ensure files are opened in binary mode + ```python + with open("file.pdf", "rb") as f: # Note the "rb" + result = md.convert_stream(f, file_extension=".pdf") + ``` + +3. **OCR not working**: Install tesseract + ```bash + # macOS + brew install tesseract + + # Ubuntu + sudo apt-get install tesseract-ocr + ``` + +## Performance Considerations + +- **PDF files**: Large PDFs may take time; consider page ranges if supported +- **Image OCR**: OCR processing is CPU-intensive +- **Audio transcription**: Requires additional compute resources +- **AI image descriptions**: Requires API calls (costs may apply) + +## Next Steps + +- See `references/api_reference.md` for complete API documentation +- Check `references/file_formats.md` for format-specific details +- Review `scripts/batch_convert.py` for automation examples +- Explore `scripts/convert_with_ai.py` for AI-enhanced conversions + +## Resources + +- **MarkItDown GitHub**: https://github.com/microsoft/markitdown +- **PyPI**: https://pypi.org/project/markitdown/ +- **OpenRouter**: https://openrouter.ai (for AI-enhanced conversions) +- **OpenRouter API Keys**: https://openrouter.ai/keys +- **OpenRouter Models**: https://openrouter.ai/models +- **MCP Server**: markitdown-mcp (for Claude Desktop integration) +- **Plugin Development**: See `packages/markitdown-sample-plugin` + diff --git a/SKILL_SUMMARY.md b/SKILL_SUMMARY.md new file mode 100644 index 0000000..33612d3 --- 
/dev/null +++ b/SKILL_SUMMARY.md @@ -0,0 +1,307 @@ +# MarkItDown Skill - Creation Summary + +## Overview + +A comprehensive skill for using Microsoft's MarkItDown tool has been created for the Claude Scientific Writer. This skill enables conversion of 15+ file formats to Markdown, optimized for LLM processing and scientific workflows. + +## What Was Created + +### Core Documentation + +1. **SKILL.md** (Main skill file) + - Complete guide to MarkItDown + - Quick start examples + - All supported formats + - Advanced features (AI, Azure DI) + - Best practices + - Use cases and examples + +2. **README.md** + - Skill overview + - Key features + - Quick reference + - Integration guide + +3. **QUICK_REFERENCE.md** + - Cheat sheet for common tasks + - Quick syntax reference + - Common commands + - Troubleshooting tips + +4. **INSTALLATION_GUIDE.md** + - Step-by-step installation + - System dependencies + - Virtual environment setup + - Optional features + - Troubleshooting + +### Reference Documentation + +Located in `references/`: + +1. **api_reference.md** + - Complete API documentation + - Class and method references + - Custom converter development + - Plugin system + - Error handling + - Breaking changes guide + +2. **file_formats.md** + - Detailed format-specific guides + - 15+ supported formats + - Format capabilities and limitations + - Best practices per format + - Example outputs + +### Utility Scripts + +Located in `scripts/`: + +1. **batch_convert.py** + - Parallel batch conversion + - Multi-format support + - Recursive directory search + - Progress tracking + - Error reporting + - Command-line interface + +2. **convert_with_ai.py** + - AI-enhanced conversions + - Predefined prompt types (scientific, medical, data viz, etc.) + - Custom prompt support + - Multiple model support + - OpenRouter integration (advanced vision models) + +3. 
**convert_literature.py** + - Scientific literature conversion + - Metadata extraction from filenames + - Year-based organization + - Automatic index generation + - JSON catalog creation + - Front matter support + +### Assets + +Located in `assets/`: + +1. **example_usage.md** + - 20+ practical examples + - Basic conversions + - Scientific workflows + - AI-enhanced processing + - Batch operations + - Error handling patterns + - Integration examples + +### License + +- **LICENSE.txt** - MIT License from Microsoft + +## Skill Structure + +``` +.claude/skills/markitdown/ +├── SKILL.md # Main skill documentation +├── README.md # Skill overview +├── QUICK_REFERENCE.md # Quick reference guide +├── INSTALLATION_GUIDE.md # Installation instructions +├── SKILL_SUMMARY.md # This file +├── LICENSE.txt # MIT License +├── references/ +│ ├── api_reference.md # Complete API docs +│ └── file_formats.md # Format-specific guides +├── scripts/ +│ ├── batch_convert.py # Batch conversion utility +│ ├── convert_with_ai.py # AI-enhanced conversion +│ └── convert_literature.py # Literature conversion +└── assets/ + └── example_usage.md # Practical examples +``` + +## Capabilities + +### File Format Support + +- **Documents**: PDF, DOCX, PPTX, XLSX, XLS, EPUB +- **Images**: JPEG, PNG, GIF, WebP (with OCR) +- **Audio**: WAV, MP3 (with transcription) +- **Web**: HTML, YouTube URLs +- **Data**: CSV, JSON, XML +- **Archives**: ZIP files +- **Email**: Outlook MSG files + +### Advanced Features + +1. **AI Enhancement via OpenRouter** + - Access to 100+ AI models through OpenRouter + - Multiple preset prompts (scientific, medical, data viz) + - Custom prompt support + - Default: Advanced vision model (best for scientific vision) + - Choose best model for each task + +2. **Azure Integration** + - Azure Document Intelligence for complex PDFs + - Enhanced layout understanding + - Better table extraction + +3. 
**Batch Processing** + - Parallel conversion with configurable workers + - Recursive directory processing + - Progress tracking and error reporting + - Format-specific organization + +4. **Scientific Workflows** + - Literature conversion with metadata + - Automatic index generation + - Year-based organization + - Citation-friendly output + +## Integration with Scientific Writer + +The skill has been added to the Scientific Writer's skill catalog: + +- **Location**: `.claude/skills/markitdown/` +- **Skill Number**: #5 in Document Manipulation Skills +- **SKILLS.md**: Updated with complete skill description + +### Usage Examples + +``` +> Convert all PDFs in the literature folder to Markdown +> Convert this PowerPoint presentation to Markdown with AI-generated descriptions +> Extract tables from this Excel file +> Transcribe this lecture recording +``` + +## Scripts Usage + +### Batch Convert +```bash +python scripts/batch_convert.py input_dir/ output_dir/ --extensions .pdf .docx --workers 4 +``` + +### AI-Enhanced Convert +```bash +export OPENROUTER_API_KEY="sk-or-v1-..." +python scripts/convert_with_ai.py paper.pdf output.md \ + --model anthropic/claude-sonnet-4.5 \ + --prompt-type scientific +``` + +### Literature Convert +```bash +python scripts/convert_literature.py papers/ markdown/ --organize-by-year --create-index +``` + +## Key Features + +1. **Token-Efficient Output**: Markdown optimized for LLM processing +2. **Comprehensive Format Support**: 15+ file types +3. **AI Enhancement**: Detailed image descriptions via OpenRouter +4. **OCR Support**: Extract text from scanned documents +5. **Audio Transcription**: Speech-to-text for audio files +6. **YouTube Support**: Video transcript extraction +7. **Plugin System**: Extensible architecture +8. **Batch Processing**: Efficient parallel conversion +9. **Error Handling**: Robust error management +10. 
**Scientific Focus**: Optimized for research workflows + +## Installation + +```bash +# Full installation +pip install 'markitdown[all]' + +# Selective installation +pip install 'markitdown[pdf,docx,pptx,xlsx]' +``` + +## Quick Start + +```python +from markitdown import MarkItDown + +# Basic usage +md = MarkItDown() +result = md.convert("document.pdf") +print(result.text_content) + +# With AI via OpenRouter +from openai import OpenAI +client = OpenAI( + api_key="your-openrouter-api-key", + base_url="https://openrouter.ai/api/v1" +) +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5" # or openai/gpt-4o +) +result = md.convert("presentation.pptx") +``` + +## Documentation Files + +| File | Purpose | Lines | +|------|---------|-------| +| SKILL.md | Main documentation | 400+ | +| api_reference.md | API documentation | 500+ | +| file_formats.md | Format guides | 600+ | +| example_usage.md | Practical examples | 500+ | +| batch_convert.py | Batch conversion | 200+ | +| convert_with_ai.py | AI conversion | 200+ | +| convert_literature.py | Literature conversion | 250+ | +| QUICK_REFERENCE.md | Quick reference | 300+ | +| INSTALLATION_GUIDE.md | Installation guide | 300+ | + +**Total**: ~3,000+ lines of documentation and code + +## Use Cases + +1. **Literature Review**: Convert research papers to Markdown for analysis +2. **Data Extraction**: Extract tables from Excel/PDF for processing +3. **Presentation Processing**: Convert slides with AI descriptions +4. **Document Analysis**: Prepare documents for LLM consumption +5. **Lecture Transcription**: Convert audio recordings to text +6. **YouTube Analysis**: Extract video transcripts +7. **Archive Processing**: Batch convert document collections + +## Next Steps + +1. Install MarkItDown: `pip install 'markitdown[all]'` +2. Read `QUICK_REFERENCE.md` for common tasks +3. Try example scripts in `scripts/` directory +4. Explore `SKILL.md` for comprehensive guide +5. 
Check `example_usage.md` for practical examples + +## Resources + +- **MarkItDown GitHub**: https://github.com/microsoft/markitdown +- **PyPI**: https://pypi.org/project/markitdown/ +- **OpenRouter**: https://openrouter.ai (AI model access) +- **OpenRouter API Keys**: https://openrouter.ai/keys +- **OpenRouter Models**: https://openrouter.ai/models +- **License**: MIT (Microsoft Corporation) +- **Python**: 3.10+ required +- **Skill Location**: `.claude/skills/markitdown/` + +## Success Criteria + +✅ Comprehensive skill documentation created +✅ Complete API reference provided +✅ Format-specific guides included +✅ Utility scripts implemented +✅ Practical examples documented +✅ Installation guide created +✅ Quick reference guide added +✅ Integration with Scientific Writer complete +✅ SKILLS.md updated +✅ Scripts made executable +✅ MIT License included + +## Skill Status + +**Status**: ✅ Complete and Ready to Use + +The MarkItDown skill is fully integrated into the Claude Scientific Writer and ready for use. All documentation, scripts, and examples are in place. + diff --git a/assets/example_usage.md b/assets/example_usage.md new file mode 100644 index 0000000..8eef213 --- /dev/null +++ b/assets/example_usage.md @@ -0,0 +1,463 @@ +# MarkItDown Example Usage + +This document provides practical examples of using MarkItDown in various scenarios. + +## Basic Examples + +### 1. Simple File Conversion + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# Convert a PDF +result = md.convert("research_paper.pdf") +print(result.text_content) + +# Convert a Word document +result = md.convert("manuscript.docx") +print(result.text_content) + +# Convert a PowerPoint +result = md.convert("presentation.pptx") +print(result.text_content) +``` + +### 2. 
Save to File + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("document.pdf") + +with open("output.md", "w", encoding="utf-8") as f: + f.write(result.text_content) +``` + +### 3. Convert from Stream + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +with open("document.pdf", "rb") as f: + result = md.convert_stream(f, file_extension=".pdf") + print(result.text_content) +``` + +## Scientific Workflows + +### Convert Research Papers + +```python +from markitdown import MarkItDown +from pathlib import Path + +md = MarkItDown() + +# Convert all papers in a directory +papers_dir = Path("research_papers/") +output_dir = Path("markdown_papers/") +output_dir.mkdir(exist_ok=True) + +for paper in papers_dir.glob("*.pdf"): + result = md.convert(str(paper)) + + # Save with original filename + output_file = output_dir / f"{paper.stem}.md" + output_file.write_text(result.text_content) + + print(f"Converted: {paper.name}") +``` + +### Extract Tables from Excel + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# Convert Excel to Markdown tables +result = md.convert("experimental_data.xlsx") + +# The result contains Markdown-formatted tables +print(result.text_content) + +# Save for further processing +with open("data_tables.md", "w") as f: + f.write(result.text_content) +``` + +### Process Presentation Slides + +```python +from markitdown import MarkItDown +from openai import OpenAI + +# With AI descriptions for images (OpenRouter client, OpenAI-compatible API) +client = OpenAI(api_key="your-openrouter-api-key", base_url="https://openrouter.ai/api/v1") +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", + llm_prompt="Describe this scientific slide, focusing on data and key findings" +) + +result = md.convert("conference_talk.pptx") + +# Save with metadata +output = f"""# Conference Talk + +{result.text_content} +""" + +with open("talk_notes.md", "w") as f: + f.write(output) +``` + +## AI-Enhanced Conversions + +### Detailed Image Descriptions + +```python +from markitdown import 
MarkItDown +from openai import OpenAI + +# Initialize OpenRouter client +client = OpenAI( + api_key="your-openrouter-api-key", + base_url="https://openrouter.ai/api/v1" +) + +# Scientific diagram analysis +scientific_prompt = """ +Analyze this scientific figure. Describe: +- Type of visualization (graph, microscopy, diagram, etc.) +- Key data points and trends +- Axes, labels, and legends +- Scientific significance +Be technical and precise. +""" + +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", # recommended for scientific vision + llm_prompt=scientific_prompt +) + +# Convert paper with figures +result = md.convert("paper_with_figures.pdf") +print(result.text_content) +``` + +### Different Prompts for Different Files + +```python +from markitdown import MarkItDown +from openai import OpenAI + +# Initialize OpenRouter client +client = OpenAI( + api_key="your-openrouter-api-key", + base_url="https://openrouter.ai/api/v1" +) + +# Scientific papers - use Claude for technical analysis +scientific_md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", + llm_prompt="Describe scientific figures with technical precision" +) + +# Presentations - same model, with a prompt tuned for slide content +presentation_md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", + llm_prompt="Summarize slide content and key visual elements" +) + +# Use appropriate instance for each file +paper_result = scientific_md.convert("research.pdf") +slides_result = presentation_md.convert("talk.pptx") +``` + +## Batch Processing + +### Process Multiple Files + +```python +from markitdown import MarkItDown +from pathlib import Path + +md = MarkItDown() + +files_to_convert = [ + "paper1.pdf", + "data.xlsx", + "presentation.pptx", + "notes.docx" +] + +for file in files_to_convert: + try: + result = md.convert(file) + output = Path(file).stem + ".md" + + with open(output, "w") as f: + f.write(result.text_content) + + print(f"✓ 
{file} -> {output}") + except Exception as e: + print(f"✗ Error converting {file}: {e}") +``` + +### Parallel Processing + +```python +from markitdown import MarkItDown +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor + +def convert_file(filepath): + md = MarkItDown() + result = md.convert(filepath) + + output = Path(filepath).stem + ".md" + with open(output, "w") as f: + f.write(result.text_content) + + return filepath, output + +files = list(Path("documents/").glob("*.pdf")) + +with ThreadPoolExecutor(max_workers=4) as executor: + results = executor.map(convert_file, [str(f) for f in files]) + + for input_file, output_file in results: + print(f"Converted: {input_file} -> {output_file}") +``` + +## Integration Examples + +### Literature Review Pipeline + +```python +from markitdown import MarkItDown +from pathlib import Path +import json + +md = MarkItDown() + +# Convert papers and create metadata +papers_dir = Path("literature/") +output_dir = Path("literature_markdown/") +output_dir.mkdir(exist_ok=True) + +catalog = [] + +for paper in papers_dir.glob("*.pdf"): + result = md.convert(str(paper)) + + # Save Markdown + md_file = output_dir / f"{paper.stem}.md" + md_file.write_text(result.text_content) + + # Store metadata + catalog.append({ + "title": result.title or paper.stem, + "source": paper.name, + "markdown": str(md_file), + "word_count": len(result.text_content.split()) + }) + +# Save catalog +with open(output_dir / "catalog.json", "w") as f: + json.dump(catalog, f, indent=2) +``` + +### Data Extraction Pipeline + +```python +from markitdown import MarkItDown +import re + +md = MarkItDown() + +# Convert Excel data to Markdown +result = md.convert("experimental_results.xlsx") + +# Extract tables (Markdown tables start with |) +tables = [] +current_table = [] +in_table = False + +for line in result.text_content.split('\n'): + if line.strip().startswith('|'): + in_table = True + current_table.append(line) + elif in_table: + if 
current_table: + tables.append('\n'.join(current_table)) + current_table = [] + in_table = False + +# Process each table +for i, table in enumerate(tables): + print(f"Table {i+1}:") + print(table) + print("\n" + "="*50 + "\n") +``` + +### YouTube Transcript Analysis + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# Get transcript +video_url = "https://www.youtube.com/watch?v=VIDEO_ID" +result = md.convert(video_url) + +# Save transcript +with open("lecture_transcript.md", "w") as f: + f.write(f"# Lecture Transcript\n\n") + f.write(f"**Source**: {video_url}\n\n") + f.write(result.text_content) +``` + +## Error Handling + +### Robust Conversion + +```python +from markitdown import MarkItDown +from pathlib import Path +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +md = MarkItDown() + +def safe_convert(filepath): + """Convert file with error handling.""" + try: + result = md.convert(filepath) + output = Path(filepath).stem + ".md" + + with open(output, "w") as f: + f.write(result.text_content) + + logger.info(f"Successfully converted {filepath}") + return True + + except FileNotFoundError: + logger.error(f"File not found: {filepath}") + return False + + except ValueError as e: + logger.error(f"Invalid file format for {filepath}: {e}") + return False + + except Exception as e: + logger.error(f"Unexpected error converting {filepath}: {e}") + return False + +# Use it +files = ["paper.pdf", "data.xlsx", "slides.pptx"] +results = [safe_convert(f) for f in files] + +print(f"Successfully converted {sum(results)}/{len(files)} files") +``` + +## Advanced Use Cases + +### Custom Metadata Extraction + +```python +from markitdown import MarkItDown +import re +from datetime import datetime + +md = MarkItDown() + +def convert_with_metadata(filepath): + result = md.convert(filepath) + + # Extract metadata from content + metadata = { + "file": filepath, + "title": result.title, + "converted_at": 
datetime.now().isoformat(), + "word_count": len(result.text_content.split()), + "char_count": len(result.text_content) + } + + # Try to find author + author_match = re.search(r'(?:Author|By):\s*(.+?)(?:\n|$)', result.text_content) + if author_match: + metadata["author"] = author_match.group(1).strip() + + # Create formatted output + output = f"""--- +title: {metadata['title']} +author: {metadata.get('author', 'Unknown')} +source: {metadata['file']} +converted: {metadata['converted_at']} +words: {metadata['word_count']} +--- + +{result.text_content} +""" + + return output, metadata + +# Use it +content, meta = convert_with_metadata("paper.pdf") +print(meta) +``` + +### Format-Specific Processing + +```python +from markitdown import MarkItDown +from pathlib import Path + +md = MarkItDown() + +def process_by_format(filepath): + path = Path(filepath) + result = md.convert(filepath) + + if path.suffix == '.pdf': + # Add PDF-specific metadata + output = f"# PDF Document: {path.stem}\n\n" + output += result.text_content + + elif path.suffix == '.xlsx': + # Add table count + table_count = result.text_content.count('|---') + output = f"# Excel Data: {path.stem}\n\n" + output += f"**Tables**: {table_count}\n\n" + output += result.text_content + + elif path.suffix == '.pptx': + # Add slide count + slide_count = result.text_content.count('## Slide') + output = f"# Presentation: {path.stem}\n\n" + output += f"**Slides**: {slide_count}\n\n" + output += result.text_content + + else: + output = result.text_content + + return output + +# Use it +content = process_by_format("presentation.pptx") +print(content) +``` + diff --git a/references/api_reference.md b/references/api_reference.md new file mode 100644 index 0000000..90ac446 --- /dev/null +++ b/references/api_reference.md @@ -0,0 +1,399 @@ +# MarkItDown API Reference + +## Core Classes + +### MarkItDown + +The main class for converting files to Markdown. 
+ +```python +from markitdown import MarkItDown + +md = MarkItDown( + llm_client=None, + llm_model=None, + llm_prompt=None, + docintel_endpoint=None, + enable_plugins=False +) +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `llm_client` | OpenAI client | `None` | OpenAI-compatible client for AI image descriptions | +| `llm_model` | str | `None` | Model name (e.g., "anthropic/claude-sonnet-4.5") for image descriptions | +| `llm_prompt` | str | `None` | Custom prompt for image description | +| `docintel_endpoint` | str | `None` | Azure Document Intelligence endpoint | +| `enable_plugins` | bool | `False` | Enable 3rd-party plugins | + +#### Methods + +##### convert() + +Convert a file to Markdown. + +```python +result = md.convert( + source, + file_extension=None +) +``` + +**Parameters**: +- `source` (str): Path to the file to convert +- `file_extension` (str, optional): Override file extension detection + +**Returns**: `DocumentConverterResult` object + +**Example**: +```python +result = md.convert("document.pdf") +print(result.text_content) +``` + +##### convert_stream() + +Convert from a file-like binary stream. + +```python +result = md.convert_stream( + stream, + file_extension +) +``` + +**Parameters**: +- `stream` (BinaryIO): Binary file-like object (e.g., file opened in `"rb"` mode) +- `file_extension` (str): File extension to determine conversion method (e.g., ".pdf") + +**Returns**: `DocumentConverterResult` object + +**Example**: +```python +with open("document.pdf", "rb") as f: + result = md.convert_stream(f, file_extension=".pdf") + print(result.text_content) +``` + +**Important**: The stream must be opened in binary mode (`"rb"`), not text mode. + +## Result Object + +### DocumentConverterResult + +The result of a conversion operation. 
+ +#### Attributes + +| Attribute | Type | Description | +|-----------|------|-------------| +| `text_content` | str | The converted Markdown text | +| `title` | str | Document title (if available) | + +#### Example + +```python +result = md.convert("paper.pdf") + +# Access content +content = result.text_content + +# Access title (if available) +title = result.title +``` + +## Custom Converters + +You can create custom document converters by implementing the `DocumentConverter` interface. + +### DocumentConverter Interface + +```python +from markitdown import DocumentConverter + +class CustomConverter(DocumentConverter): + def convert(self, stream, file_extension): + """ + Convert a document from a binary stream. + + Parameters: + stream (BinaryIO): Binary file-like object + file_extension (str): File extension (e.g., ".custom") + + Returns: + DocumentConverterResult: Conversion result + """ + # Your conversion logic here + pass +``` + +### Registering Custom Converters + +```python +from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult + +class MyCustomConverter(DocumentConverter): + def convert(self, stream, file_extension): + content = stream.read().decode('utf-8') + markdown_text = f"# Custom Format\n\n{content}" + return DocumentConverterResult( + text_content=markdown_text, + title="Custom Document" + ) + +# Create MarkItDown instance +md = MarkItDown() + +# Register custom converter for .custom files +md.register_converter(".custom", MyCustomConverter()) + +# Use it +result = md.convert("myfile.custom") +``` + +## Plugin System + +### Finding Plugins + +Search GitHub for `#markitdown-plugin` tag. + +### Using Plugins + +```python +from markitdown import MarkItDown + +# Enable plugins +md = MarkItDown(enable_plugins=True) +result = md.convert("document.pdf") +``` + +### Creating Plugins + +Plugins are Python packages that register converters with MarkItDown. 
+ +**Plugin Structure**: +``` +my-markitdown-plugin/ +├── setup.py +├── my_plugin/ +│ ├── __init__.py +│ └── converter.py +└── README.md +``` + +**setup.py**: +```python +from setuptools import setup + +setup( + name="markitdown-my-plugin", + version="0.1.0", + packages=["my_plugin"], + entry_points={ + "markitdown.plugins": [ + "my_plugin = my_plugin.converter:MyConverter", + ], + }, +) +``` + +**converter.py**: +```python +from markitdown import DocumentConverter, DocumentConverterResult + +class MyConverter(DocumentConverter): + def convert(self, stream, file_extension): + # Your conversion logic + content = stream.read() + markdown = self.process(content) + return DocumentConverterResult( + text_content=markdown, + title="My Document" + ) + + def process(self, content): + # Process content + return "# Converted Content\n\n..." +``` + +## AI-Enhanced Conversions + +### Using OpenRouter for Image Descriptions + +```python +from markitdown import MarkItDown +from openai import OpenAI + +# Initialize OpenRouter client (OpenAI-compatible API) +client = OpenAI( + api_key="your-openrouter-api-key", + base_url="https://openrouter.ai/api/v1" +) + +# Create MarkItDown with AI support +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", # recommended for scientific vision + llm_prompt="Describe this image in detail for scientific documentation" +) + +# Convert files with images +result = md.convert("presentation.pptx") +``` + +### Available Models via OpenRouter + +Popular models with vision support: +- `anthropic/claude-sonnet-4.5` - **Recommended for scientific vision** +- `anthropic/claude-opus-4.5` - Advanced vision model +- `openai/gpt-4o` - GPT-4 Omni +- `openai/gpt-4-vision` - GPT-4 Vision +- `google/gemini-pro-vision` - Gemini Pro Vision + +See https://openrouter.ai/models for the complete list. + +### Custom Prompts + +```python +# For scientific diagrams +scientific_prompt = """ +Analyze this scientific diagram or chart. 
Describe: +1. The type of visualization (graph, chart, diagram, etc.) +2. Key data points or trends +3. Labels and axes +4. Scientific significance +Be precise and technical. +""" + +md = MarkItDown( + llm_client=client, + llm_model="anthropic/claude-sonnet-4.5", + llm_prompt=scientific_prompt +) +``` + +## Azure Document Intelligence + +### Setup + +1. Create Azure Document Intelligence resource +2. Get endpoint URL +3. Set authentication + +### Usage + +```python +from markitdown import MarkItDown + +md = MarkItDown( + docintel_endpoint="https://YOUR-RESOURCE.cognitiveservices.azure.com/" +) + +result = md.convert("complex_document.pdf") +``` + +### Authentication + +Set environment variables: +```bash +export AZURE_DOCUMENT_INTELLIGENCE_KEY="your-key" +``` + +Or pass credentials programmatically. + +## Error Handling + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +try: + result = md.convert("document.pdf") + print(result.text_content) +except FileNotFoundError: + print("File not found") +except ValueError as e: + print(f"Invalid file format: {e}") +except Exception as e: + print(f"Conversion error: {e}") +``` + +## Performance Tips + +### 1. Reuse MarkItDown Instance + +```python +# Good: Create once, use many times +md = MarkItDown() + +for file in files: + result = md.convert(file) + process(result) +``` + +### 2. Use Streaming for Large Files + +```python +# For large files +with open("large_file.pdf", "rb") as f: + result = md.convert_stream(f, file_extension=".pdf") +``` + +### 3. Batch Processing + +```python +from concurrent.futures import ThreadPoolExecutor + +md = MarkItDown() + +def convert_file(filepath): + return md.convert(filepath) + +with ThreadPoolExecutor(max_workers=4) as executor: + results = executor.map(convert_file, file_list) +``` + +## Breaking Changes (v0.0.1 to v0.1.0) + +1. 
**Dependencies**: Now organized into optional feature groups + ```bash + # Old + pip install markitdown + + # New + pip install 'markitdown[all]' + ``` + +2. **convert_stream()**: Now requires binary file-like object + ```python + # Old (also accepted text) + with open("file.pdf", "r") as f: # text mode + result = md.convert_stream(f) + + # New (binary only) + with open("file.pdf", "rb") as f: # binary mode + result = md.convert_stream(f, file_extension=".pdf") + ``` + +3. **DocumentConverter Interface**: Changed to read from streams instead of file paths + - No temporary files created + - More memory efficient + - Plugins need updating + +## Version Compatibility + +- **Python**: 3.10 or higher required +- **Dependencies**: Check `setup.py` for version constraints +- **OpenAI**: Compatible with OpenAI Python SDK v1.0+ + +## Environment Variables + +| Variable | Description | Example | +|----------|-------------|---------| +| `OPENROUTER_API_KEY` | OpenRouter API key for image descriptions | `sk-or-v1-...` | +| `AZURE_DOCUMENT_INTELLIGENCE_KEY` | Azure DI authentication | `key123...` | +| `AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT` | Azure DI endpoint | `https://...` | + diff --git a/references/file_formats.md b/references/file_formats.md new file mode 100644 index 0000000..9cc27bb --- /dev/null +++ b/references/file_formats.md @@ -0,0 +1,542 @@ +# File Format Support + +This document provides detailed information about each file format supported by MarkItDown. 
+ +## Document Formats + +### PDF (.pdf) + +**Capabilities**: +- Text extraction +- Table detection +- Metadata extraction +- OCR for scanned documents (with dependencies) + +**Dependencies**: +```bash +pip install 'markitdown[pdf]' +``` + +**Best For**: +- Scientific papers +- Reports +- Books +- Forms + +**Limitations**: +- Complex layouts may not preserve perfect formatting +- Scanned PDFs require OCR setup +- Some PDF features (annotations, forms) may not convert + +**Example**: +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("research_paper.pdf") +print(result.text_content) +``` + +**Enhanced with Azure Document Intelligence**: +```python +md = MarkItDown(docintel_endpoint="https://YOUR-ENDPOINT.cognitiveservices.azure.com/") +result = md.convert("complex_layout.pdf") +``` + +--- + +### Microsoft Word (.docx) + +**Capabilities**: +- Text extraction +- Table conversion +- Heading hierarchy +- List formatting +- Basic text formatting (bold, italic) + +**Dependencies**: +```bash +pip install 'markitdown[docx]' +``` + +**Best For**: +- Research papers +- Reports +- Documentation +- Manuscripts + +**Preserved Elements**: +- Headings (converted to Markdown headers) +- Tables (converted to Markdown tables) +- Lists (bulleted and numbered) +- Basic formatting (bold, italic) +- Paragraphs + +**Example**: +```python +result = md.convert("manuscript.docx") +``` + +--- + +### PowerPoint (.pptx) + +**Capabilities**: +- Slide content extraction +- Speaker notes +- Table extraction +- Image descriptions (with AI) + +**Dependencies**: +```bash +pip install 'markitdown[pptx]' +``` + +**Best For**: +- Presentations +- Lecture slides +- Conference talks + +**Output Format**: +```markdown +# Slide 1: Title + +Content from slide 1... + +**Notes**: Speaker notes appear here + +--- + +# Slide 2: Next Topic + +... 
+``` + +**With AI Image Descriptions**: +```python +from openai import OpenAI + +client = OpenAI() +md = MarkItDown(llm_client=client, llm_model="gpt-4o") +result = md.convert("presentation.pptx") +``` + +--- + +### Excel (.xlsx, .xls) + +**Capabilities**: +- Sheet extraction +- Table formatting +- Data preservation +- Formula values (calculated) + +**Dependencies**: +```bash +pip install 'markitdown[xlsx]' # Modern Excel +pip install 'markitdown[xls]' # Legacy Excel +``` + +**Best For**: +- Data tables +- Research data +- Statistical results +- Experimental data + +**Output Format**: +```markdown +# Sheet: Results + +| Sample | Control | Treatment | P-value | +|--------|---------|-----------|---------| +| 1 | 10.2 | 12.5 | 0.023 | +| 2 | 9.8 | 11.9 | 0.031 | +``` + +**Example**: +```python +result = md.convert("experimental_data.xlsx") +``` + +--- + +## Image Formats + +### Images (.jpg, .jpeg, .png, .gif, .webp) + +**Capabilities**: +- EXIF metadata extraction +- OCR text extraction +- AI-powered image descriptions + +**Dependencies**: +```bash +pip install 'markitdown[all]' # Includes image support +``` + +**Best For**: +- Scanned documents +- Charts and graphs +- Scientific diagrams +- Photographs with text + +**Output Without AI**: +```markdown +![Image](image.jpg) + +**EXIF Data**: +- Camera: Canon EOS 5D +- Date: 2024-01-15 +- Resolution: 4000x3000 +``` + +**Output With AI**: +```python +from openai import OpenAI + +client = OpenAI() +md = MarkItDown( + llm_client=client, + llm_model="gpt-4o", + llm_prompt="Describe this scientific diagram in detail" +) +result = md.convert("graph.png") +``` + +**OCR for Text Extraction**: +Requires Tesseract OCR: +```bash +# macOS +brew install tesseract + +# Ubuntu +sudo apt-get install tesseract-ocr +``` + +--- + +## Audio Formats + +### Audio (.wav, .mp3) + +**Capabilities**: +- Metadata extraction +- Speech-to-text transcription +- Duration and technical info + +**Dependencies**: +```bash +pip install 
'markitdown[audio-transcription]' +``` + +**Best For**: +- Lecture recordings +- Interviews +- Podcasts +- Meeting recordings + +**Output Format**: +```markdown +# Audio: interview.mp3 + +**Metadata**: +- Duration: 45:32 +- Bitrate: 320kbps +- Sample Rate: 44100Hz + +**Transcription**: +[Transcribed text appears here...] +``` + +**Example**: +```python +result = md.convert("lecture.mp3") +``` + +--- + +## Web Formats + +### HTML (.html, .htm) + +**Capabilities**: +- Clean HTML to Markdown conversion +- Link preservation +- Table conversion +- List formatting + +**Best For**: +- Web pages +- Documentation +- Blog posts +- Online articles + +**Output Format**: Clean Markdown with preserved links and structure + +**Example**: +```python +result = md.convert("webpage.html") +``` + +--- + +### YouTube URLs + +**Capabilities**: +- Fetch video transcriptions +- Extract video metadata +- Caption download + +**Dependencies**: +```bash +pip install 'markitdown[youtube-transcription]' +``` + +**Best For**: +- Educational videos +- Lectures +- Talks +- Tutorials + +**Example**: +```python +result = md.convert("https://www.youtube.com/watch?v=VIDEO_ID") +``` + +--- + +## Data Formats + +### CSV (.csv) + +**Capabilities**: +- Automatic table conversion +- Delimiter detection +- Header preservation + +**Output Format**: Markdown tables + +**Example**: +```python +result = md.convert("data.csv") +``` + +**Output**: +```markdown +| Column1 | Column2 | Column3 | +|---------|---------|---------| +| Value1 | Value2 | Value3 | +``` + +--- + +### JSON (.json) + +**Capabilities**: +- Structured representation +- Pretty formatting +- Nested data visualization + +**Best For**: +- API responses +- Configuration files +- Data exports + +**Example**: +```python +result = md.convert("data.json") +``` + +--- + +### XML (.xml) + +**Capabilities**: +- Structure preservation +- Attribute extraction +- Formatted output + +**Best For**: +- Configuration files +- Data interchange +- Structured 
documents + +**Example**: +```python +result = md.convert("config.xml") +``` + +--- + +## Archive Formats + +### ZIP (.zip) + +**Capabilities**: +- Iterates through archive contents +- Converts each file individually +- Maintains directory structure in output + +**Best For**: +- Document collections +- Project archives +- Batch conversions + +**Output Format**: +```markdown +# Archive: documents.zip + +## File: document1.pdf +[Content from document1.pdf...] + +--- + +## File: document2.docx +[Content from document2.docx...] +``` + +**Example**: +```python +result = md.convert("archive.zip") +``` + +--- + +## E-book Formats + +### EPUB (.epub) + +**Capabilities**: +- Full text extraction +- Chapter structure +- Metadata extraction + +**Best For**: +- E-books +- Digital publications +- Long-form content + +**Output Format**: Markdown with preserved chapter structure + +**Example**: +```python +result = md.convert("book.epub") +``` + +--- + +## Other Formats + +### Outlook Messages (.msg) + +**Capabilities**: +- Email content extraction +- Attachment listing +- Metadata (from, to, subject, date) + +**Dependencies**: +```bash +pip install 'markitdown[outlook]' +``` + +**Best For**: +- Email archives +- Communication records + +**Example**: +```python +result = md.convert("message.msg") +``` + +--- + +## Format-Specific Tips + +### PDF Best Practices + +1. **Use Azure Document Intelligence for complex layouts**: + ```python + md = MarkItDown(docintel_endpoint="endpoint_url") + ``` + +2. **For scanned PDFs, ensure OCR is set up**: + ```bash + brew install tesseract # macOS + ``` + +3. **Split very large PDFs before conversion** for better performance + +### PowerPoint Best Practices + +1. **Use AI for visual content**: + ```python + md = MarkItDown(llm_client=client, llm_model="gpt-4o") + ``` + +2. **Check speaker notes** - they're included in output + +3. **Complex animations won't be captured** - static content only + +### Excel Best Practices + +1. 
**Large spreadsheets** may take time to convert + +2. **Formulas are converted to their calculated values** + +3. **Multiple sheets** are all included in output + +4. **Charts become text descriptions** (use AI for better descriptions) + +### Image Best Practices + +1. **Use AI for meaningful descriptions**: + ```python + md = MarkItDown( + llm_client=client, + llm_model="gpt-4o", + llm_prompt="Describe this scientific figure in detail" + ) + ``` + +2. **For text-heavy images, ensure OCR dependencies** are installed + +3. **High-resolution images** may take longer to process + +### Audio Best Practices + +1. **Clear audio** produces better transcriptions + +2. **Long recordings** may take significant time + +3. **Consider splitting long audio files** for faster processing + +--- + +## Unsupported Formats + +If you need to convert an unsupported format: + +1. **Create a custom converter** (see `api_reference.md`) +2. **Look for plugins** on GitHub (#markitdown-plugin) +3. **Pre-convert to supported format** (e.g., convert .rtf to .docx) + +--- + +## Format Detection + +MarkItDown automatically detects format from: + +1. **File extension** (primary method) +2. **MIME type** (fallback) +3. **File signature** (magic bytes, fallback) + +**Override detection**: +```python +# Force specific format +result = md.convert("file_without_extension", file_extension=".pdf") + +# With streams +with open("file", "rb") as f: + result = md.convert_stream(f, file_extension=".pdf") +``` + diff --git a/scripts/batch_convert.py b/scripts/batch_convert.py new file mode 100755 index 0000000..e763210 --- /dev/null +++ b/scripts/batch_convert.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +""" +Batch convert multiple files to Markdown using MarkItDown. + +This script demonstrates how to efficiently convert multiple files +in a directory to Markdown format. 
def batch_convert(
    input_dir: Path,
    output_dir: Path,
    extensions: Optional[List[str]] = None,
    recursive: bool = False,
    workers: int = 4,
    verbose: bool = False,
    enable_plugins: bool = False
) -> dict:
    """
    Batch convert files in a directory.

    Args:
        input_dir: Input directory
        output_dir: Output directory (created if missing)
        extensions: List of file extensions to convert (e.g., ['.pdf', '.docx']).
            Entries may be written with or without the leading dot (or as
            '*.pdf'); blank and repeated entries are ignored.
        recursive: Search subdirectories
        workers: Number of parallel workers (values below 1 are treated as 1)
        verbose: Print detailed messages
        enable_plugins: Enable MarkItDown plugins

    Returns:
        Dictionary with conversion statistics. It always has the keys
        'total', 'success', 'failed' and 'details'; 'details' is a list of
        {'file', 'success', 'message'} dicts in completion order.
    """
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    # Default extensions if not specified
    if extensions is None:
        extensions = ['.pdf', '.docx', '.pptx', '.xlsx', '.html', '.jpg', '.png']

    # Normalise user input so "pdf", ".pdf" and "*.pdf" all mean the same
    # thing, and so a repeated extension does not queue a file twice.
    normalized = []
    for raw_ext in extensions:
        ext = raw_ext.strip().lstrip('*')
        if not ext:
            continue
        if not ext.startswith('.'):
            ext = f".{ext}"
        if ext not in normalized:
            normalized.append(ext)

    # Find files. Overlapping patterns (".gz" and ".tar.gz") can yield the
    # same path more than once, and a glob can also match directories, so
    # keep only regular files and keep each path once.
    finder = input_dir.rglob if recursive else input_dir.glob
    seen = set()
    files = []
    for ext in normalized:
        for candidate in finder(f"*{ext}"):
            if candidate in seen or not candidate.is_file():
                continue
            seen.add(candidate)
            files.append(candidate)

    if not files:
        print(f"No files found with extensions: {', '.join(normalized)}")
        # Same shape as the normal result so callers can index 'details'.
        return {'total': 0, 'success': 0, 'failed': 0, 'details': []}

    print(f"Found {len(files)} file(s) to convert")

    # Create MarkItDown instance (shared by all workers)
    md = MarkItDown(enable_plugins=enable_plugins)

    # Convert files in parallel
    results = {
        'total': len(files),
        'success': 0,
        'failed': 0,
        'details': []
    }

    # ThreadPoolExecutor raises ValueError for max_workers <= 0.
    with ThreadPoolExecutor(max_workers=max(1, workers)) as executor:
        futures = {
            executor.submit(convert_file, md, file_path, output_dir, verbose): file_path
            for file_path in files
        }

        for future in as_completed(futures):
            success, path, message = future.result()

            if success:
                results['success'] += 1
            else:
                results['failed'] += 1

            results['details'].append({
                'file': path,
                'success': success,
                'message': message
            })

            print(message)

    return results
documents/ markdown/ --extensions .pdf .docx .pptx -r + + # Use 8 parallel workers + python batch_convert.py input/ output/ --workers 8 + + # Enable plugins + python batch_convert.py input/ output/ --plugins + """ + ) + + parser.add_argument('input_dir', type=Path, help='Input directory') + parser.add_argument('output_dir', type=Path, help='Output directory') + parser.add_argument( + '--extensions', '-e', + nargs='+', + help='File extensions to convert (e.g., .pdf .docx)' + ) + parser.add_argument( + '--recursive', '-r', + action='store_true', + help='Search subdirectories recursively' + ) + parser.add_argument( + '--workers', '-w', + type=int, + default=4, + help='Number of parallel workers (default: 4)' + ) + parser.add_argument( + '--verbose', '-v', + action='store_true', + help='Verbose output' + ) + parser.add_argument( + '--plugins', '-p', + action='store_true', + help='Enable MarkItDown plugins' + ) + + args = parser.parse_args() + + # Validate input directory + if not args.input_dir.exists(): + print(f"Error: Input directory '{args.input_dir}' does not exist") + sys.exit(1) + + if not args.input_dir.is_dir(): + print(f"Error: '{args.input_dir}' is not a directory") + sys.exit(1) + + # Run batch conversion + results = batch_convert( + input_dir=args.input_dir, + output_dir=args.output_dir, + extensions=args.extensions, + recursive=args.recursive, + workers=args.workers, + verbose=args.verbose, + enable_plugins=args.plugins + ) + + # Print summary + print("\n" + "="*50) + print("CONVERSION SUMMARY") + print("="*50) + print(f"Total files: {results['total']}") + print(f"Successful: {results['success']}") + print(f"Failed: {results['failed']}") + print(f"Success rate: {results['success']/results['total']*100:.1f}%" if results['total'] > 0 else "N/A") + + # Show failed files if any + if results['failed'] > 0: + print("\nFailed conversions:") + for detail in results['details']: + if not detail['success']: + print(f" - {detail['file']}: {detail['message']}") + + 
def extract_metadata_from_filename(filename: str) -> Dict[str, str]:
    """
    Try to extract metadata from filename.
    Supports patterns like: Author_Year_Title.pdf

    The stem is split on underscores and dashes. The first part becomes the
    author and the remaining parts become the title. A standalone year part
    is left out of the title as long as other title words remain.

    Args:
        filename: File name or path; directory and extension are ignored.

    Returns:
        Dict that always contains 'title', plus 'author' when the stem has at
        least two separator-delimited parts and 'year' when a 19xx/20xx
        number is found.
    """
    metadata = {}

    # Remove extension
    name = Path(filename).stem

    # Try to extract year. "_" is a word character, so \b never matches
    # between "_" and a digit and "Smith_2023_Title" would yield no year.
    # These lookarounds behave like \b but treat "_" as a separator.
    year_match = re.search(r'(?<![^\W_])(?:19|20)\d{2}(?![^\W_])', name)
    year = year_match.group() if year_match else None
    if year is not None:
        metadata['year'] = year

    # Split by underscores or dashes
    parts = re.split(r'[_\-]', name)
    if len(parts) >= 2:
        metadata['author'] = parts[0]
        title_parts = parts[1:]
        if year in title_parts:
            without_year = list(title_parts)
            without_year.remove(year)  # drop only the first occurrence
            # Keep the year as the title if nothing else would be left.
            if any(without_year):
                title_parts = without_year
        metadata['title'] = ' '.join(title_parts)
    else:
        metadata['title'] = name

    return metadata
+ + Args: + md: MarkItDown instance + input_file: Path to PDF file + output_dir: Output directory + organize_by_year: Organize into year subdirectories + + Returns: + Tuple of (success, metadata_dict) + """ + try: + print(f"Converting: {input_file.name}") + + # Convert to Markdown + result = md.convert(str(input_file)) + + # Extract metadata from filename + metadata = extract_metadata_from_filename(input_file.name) + metadata['source_file'] = input_file.name + metadata['converted_date'] = datetime.now().isoformat() + + # Try to extract title from content if not in filename + if 'title' not in metadata and result.title: + metadata['title'] = result.title + + # Create output path + if organize_by_year and 'year' in metadata: + output_subdir = output_dir / metadata['year'] + output_subdir.mkdir(parents=True, exist_ok=True) + else: + output_subdir = output_dir + output_subdir.mkdir(parents=True, exist_ok=True) + + output_file = output_subdir / f"{input_file.stem}.md" + + # Create formatted Markdown with front matter + content = "---\n" + content += f"title: \"{metadata.get('title', input_file.stem)}\"\n" + if 'author' in metadata: + content += f"author: \"{metadata['author']}\"\n" + if 'year' in metadata: + content += f"year: {metadata['year']}\n" + content += f"source: \"{metadata['source_file']}\"\n" + content += f"converted: \"{metadata['converted_date']}\"\n" + content += "---\n\n" + + # Add title + content += f"# {metadata.get('title', input_file.stem)}\n\n" + + # Add metadata section + content += "## Document Information\n\n" + if 'author' in metadata: + content += f"**Author**: {metadata['author']}\n" + if 'year' in metadata: + content += f"**Year**: {metadata['year']}\n" + content += f"**Source File**: {metadata['source_file']}\n" + content += f"**Converted**: {metadata['converted_date']}\n\n" + content += "---\n\n" + + # Add content + content += result.text_content + + # Write to file + output_file.write_text(content, encoding='utf-8') + + print(f"✓ Saved to: 
def create_index(papers: List[Dict], output_dir: Path):
    """Create an index/catalog of all converted papers.

    Writes two files into ``output_dir``:

    * ``INDEX.md`` - papers grouped under one heading per year ("Unknown"
      when a paper has no 'year'), each with a link to its Markdown file.
    * ``catalog.json`` - the paper dicts, sorted by year then title.

    Args:
        papers: Metadata dicts; the keys read here are 'year', 'title',
            'author' and 'source_file', all optional.
        output_dir: Directory holding the converted Markdown files.
    """

    # Sort by year (if available) and title; papers without a year go last
    papers_sorted = sorted(
        papers,
        key=lambda x: (x.get('year', '9999'), x.get('title', ''))
    )

    # Create Markdown index
    chunks = [
        "# Literature Review Index\n\n",
        f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"**Total Papers**: {len(papers)}\n\n",
        "---\n\n",
    ]

    # Group by year
    by_year = {}
    for paper in papers_sorted:
        by_year.setdefault(paper.get('year', 'Unknown'), []).append(paper)

    # Write by year
    for year in sorted(by_year):
        chunks.append(f"## {year}\n\n")
        for paper in by_year[year]:
            title = paper.get('title', paper.get('source_file', 'Unknown'))
            author = paper.get('author', 'Unknown Author')
            source = paper.get('source_file', '')

            # Create link to markdown file. Files sit in a year subdirectory
            # only when the conversion organized them by year, so fall back
            # to the flat path when that is the only copy on disk.
            md_name = Path(source).stem + ".md"
            md_file = md_name
            if 'year' in paper and paper['year'] != 'Unknown':
                nested = f"{paper['year']}/{md_name}"
                if (output_dir / nested).exists() or not (output_dir / md_name).exists():
                    md_file = nested

            chunks.append(f"- **{title}**\n")
            chunks.append(f"  - Author: {author}\n")
            chunks.append(f"  - Source: {source}\n")
            chunks.append(f"  - [Read Markdown]({md_file})\n\n")

    # Write index
    index_file = output_dir / "INDEX.md"
    index_file.write_text("".join(chunks), encoding='utf-8')
    print(f"\n✓ Created index: {index_file}")

    # Also create JSON catalog
    catalog_file = output_dir / "catalog.json"
    with open(catalog_file, 'w', encoding='utf-8') as f:
        json.dump(papers_sorted, f, indent=2, ensure_ascii=False)
    print(f"✓ Created catalog: {catalog_file}")
literature PDFs to Markdown", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Convert all PDFs in a directory + python convert_literature.py papers/ output/ + + # Organize by year + python convert_literature.py papers/ output/ --organize-by-year + + # Create index of all papers + python convert_literature.py papers/ output/ --create-index + +Filename Conventions: + For best results, name your PDFs using this pattern: + Author_Year_Title.pdf + + Examples: + Smith_2023_Machine_Learning_Applications.pdf + Jones_2022_Climate_Change_Analysis.pdf + """ + ) + + parser.add_argument('input_dir', type=Path, help='Directory with PDF files') + parser.add_argument('output_dir', type=Path, help='Output directory for Markdown files') + parser.add_argument( + '--organize-by-year', '-y', + action='store_true', + help='Organize output into year subdirectories' + ) + parser.add_argument( + '--create-index', '-i', + action='store_true', + help='Create an index/catalog of all papers' + ) + parser.add_argument( + '--recursive', '-r', + action='store_true', + help='Search subdirectories recursively' + ) + + args = parser.parse_args() + + # Validate input + if not args.input_dir.exists(): + print(f"Error: Input directory '{args.input_dir}' does not exist") + sys.exit(1) + + if not args.input_dir.is_dir(): + print(f"Error: '{args.input_dir}' is not a directory") + sys.exit(1) + + # Find PDF files + if args.recursive: + pdf_files = list(args.input_dir.rglob("*.pdf")) + else: + pdf_files = list(args.input_dir.glob("*.pdf")) + + if not pdf_files: + print("No PDF files found") + sys.exit(1) + + print(f"Found {len(pdf_files)} PDF file(s)") + + # Create MarkItDown instance + md = MarkItDown() + + # Convert all papers + results = [] + success_count = 0 + + for pdf_file in pdf_files: + success, metadata = convert_paper( + md, + pdf_file, + args.output_dir, + args.organize_by_year + ) + + if success: + success_count += 1 + results.append(metadata) + + # Create 
index if requested + if args.create_index and results: + create_index(results, args.output_dir) + + # Print summary + print("\n" + "="*50) + print("CONVERSION SUMMARY") + print("="*50) + print(f"Total papers: {len(pdf_files)}") + print(f"Successful: {success_count}") + print(f"Failed: {len(pdf_files) - success_count}") + print(f"Success rate: {success_count/len(pdf_files)*100:.1f}%") + + sys.exit(0 if success_count == len(pdf_files) else 1) + + +if __name__ == '__main__': + main() + diff --git a/scripts/convert_with_ai.py b/scripts/convert_with_ai.py new file mode 100755 index 0000000..47dcebc --- /dev/null +++ b/scripts/convert_with_ai.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +""" +Convert documents to Markdown with AI-enhanced image descriptions. + +This script demonstrates how to use MarkItDown with OpenRouter to generate +detailed descriptions of images in documents (PowerPoint, PDFs with images, etc.) +""" + +import argparse +import os +import sys +from pathlib import Path +from markitdown import MarkItDown +from openai import OpenAI + + +# Predefined prompts for different use cases +PROMPTS = { + 'scientific': """ +Analyze this scientific image or diagram. Provide: +1. Type of visualization (graph, chart, microscopy, diagram, etc.) +2. Key data points, trends, or patterns +3. Axes labels, legends, and scales +4. Notable features or findings +5. Scientific context and significance +Be precise, technical, and detailed. + """.strip(), + + 'presentation': """ +Describe this presentation slide image. Include: +1. Main visual elements and their arrangement +2. Key points or messages conveyed +3. Data or information presented +4. Visual hierarchy and emphasis +Keep the description clear and informative. + """.strip(), + + 'general': """ +Describe this image in detail. Include: +1. Main subjects and objects +2. Visual composition and layout +3. Text content (if any) +4. Notable details +5. Overall context and purpose +Be comprehensive and accurate. 
def convert_with_ai(
    input_file: Path,
    output_file: Path,
    api_key: str,
    model: str = "anthropic/claude-sonnet-4.5",
    prompt_type: str = "general",
    custom_prompt: str = None
) -> bool:
    """
    Convert one file to Markdown, describing its images with an LLM.

    The conversion is routed through OpenRouter using an OpenAI-compatible
    client. The result is written to ``output_file`` behind a short header
    recording the source, format, model and prompt type.

    Args:
        input_file: Path to input file
        output_file: Path to output Markdown file (parent dirs are created)
        api_key: OpenRouter API key
        model: Model name (default: anthropic/claude-sonnet-4.5)
        prompt_type: Key into PROMPTS; unknown keys fall back to 'general'
        custom_prompt: Custom prompt (overrides prompt_type when non-empty)

    Returns:
        True if successful, False otherwise (the error is printed to stderr)
    """
    try:
        # OpenRouter speaks the OpenAI API, so only the base URL differs
        llm = OpenAI(
            api_key=api_key,
            base_url="https://openrouter.ai/api/v1"
        )

        # A non-empty custom prompt wins over the predefined ones
        using_custom = bool(custom_prompt)
        prompt_text = custom_prompt if using_custom else PROMPTS.get(prompt_type, PROMPTS['general'])
        prompt_label = 'custom' if using_custom else prompt_type

        print(f"Using model: {model}")
        print(f"Prompt type: {prompt_label}")
        print(f"Converting: {input_file}")

        converter = MarkItDown(
            llm_client=llm,
            llm_model=model,
            llm_prompt=prompt_text
        )
        converted = converter.convert(str(input_file))

        # Metadata header, then the converted body
        header = "".join((
            f"# {converted.title or input_file.stem}\n\n",
            f"**Source**: {input_file.name}\n",
            f"**Format**: {input_file.suffix}\n",
            f"**AI Model**: {model}\n",
            f"**Prompt Type**: {prompt_label}\n\n",
            "---\n\n",
        ))
        document = header + converted.text_content

        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(document, encoding='utf-8')

        print(f"✓ Successfully converted to: {output_file}")
        return True

    except Exception as exc:
        # Top-level boundary for the CLI: report and signal failure
        print(f"✗ Error: {str(exc)}", file=sys.stderr)
        return False
+ python convert_with_ai.py image.jpg image.md + +Environment Variables: + OPENROUTER_API_KEY OpenRouter API key (required if not passed via --api-key) + +Popular Models (use with --model): + anthropic/claude-sonnet-4.5 - Recommended for scientific vision + anthropic/claude-opus-4.5 - Advanced vision model + openai/gpt-4o - GPT-4 Omni (vision support) + openai/gpt-4-vision - GPT-4 Vision + google/gemini-pro-vision - Gemini Pro Vision + """ + ) + + parser.add_argument('input', type=Path, help='Input file') + parser.add_argument('output', type=Path, help='Output Markdown file') + parser.add_argument( + '--api-key', '-k', + help='OpenRouter API key (or set OPENROUTER_API_KEY env var)' + ) + parser.add_argument( + '--model', '-m', + default='anthropic/claude-sonnet-4.5', + help='Model to use via OpenRouter (default: anthropic/claude-sonnet-4.5)' + ) + parser.add_argument( + '--prompt-type', '-t', + choices=list(PROMPTS.keys()), + default='general', + help='Type of prompt to use (default: general)' + ) + parser.add_argument( + '--custom-prompt', '-p', + help='Custom prompt (overrides --prompt-type)' + ) + parser.add_argument( + '--list-prompts', '-l', + action='store_true', + help='List available prompt types and exit' + ) + + args = parser.parse_args() + + # List prompts and exit + if args.list_prompts: + print("Available prompt types:\n") + for name, prompt in PROMPTS.items(): + print(f"[{name}]") + print(prompt) + print("\n" + "="*60 + "\n") + sys.exit(0) + + # Get API key + api_key = args.api_key or os.environ.get('OPENROUTER_API_KEY') + if not api_key: + print("Error: OpenRouter API key required. 
Set OPENROUTER_API_KEY environment variable or use --api-key") + print("Get your API key at: https://openrouter.ai/keys") + sys.exit(1) + + # Validate input file + if not args.input.exists(): + print(f"Error: Input file '{args.input}' does not exist") + sys.exit(1) + + # Convert file + success = convert_with_ai( + input_file=args.input, + output_file=args.output, + api_key=api_key, + model=args.model, + prompt_type=args.prompt_type, + custom_prompt=args.custom_prompt + ) + + sys.exit(0 if success else 1) + + +if __name__ == '__main__': + main() +