mirror of
https://github.com/K-Dense-AI/claude-scientific-skills.git
synced 2026-01-26 16:58:56 +08:00
Update all the latest writing skills
This commit is contained in:
@@ -1,317 +1,228 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch conversion utility for MarkItDown.
|
||||
Batch convert multiple files to Markdown using MarkItDown.
|
||||
|
||||
Converts all supported files in a directory to Markdown format.
|
||||
This script demonstrates how to efficiently convert multiple files
|
||||
in a directory to Markdown format.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from markitdown import MarkItDown
|
||||
from typing import Optional, List
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
from markitdown import MarkItDown
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import sys
|
||||
|
||||
|
||||
# Supported file extensions
|
||||
# File extensions (lowercase, with leading dot) that the batch converter
# will pick up when scanning a directory.
SUPPORTED_EXTENSIONS = {
    '.pdf', '.docx', '.pptx', '.xlsx', '.xls',
    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff',
    '.wav', '.mp3', '.flac', '.ogg', '.aiff',
    '.html', '.htm', '.epub',
    '.csv', '.json', '.xml',
    '.zip',
}
|
||||
|
||||
|
||||
def setup_markitdown(
    use_llm: bool = False,
    llm_model: str = "gpt-4o",
    use_azure_di: bool = False,
    azure_endpoint: Optional[str] = None,
    azure_key: Optional[str] = None
) -> MarkItDown:
    """
    Build a MarkItDown instance with optional advanced features.

    Each optional feature is best-effort: if its prerequisite is missing
    (the ``openai`` package, or Azure credentials) a warning is printed and
    the feature is left disabled instead of aborting.

    Args:
        use_llm: Enable LLM-powered image descriptions
        llm_model: LLM model to use (default: gpt-4o)
        use_azure_di: Enable Azure Document Intelligence
        azure_endpoint: Azure Document Intelligence endpoint
        azure_key: Azure Document Intelligence API key

    Returns:
        Configured MarkItDown instance
    """
    kwargs = {}

    if use_llm:
        # Only the import is guarded: a missing package downgrades to a
        # warning, while client construction errors still propagate.
        try:
            from openai import OpenAI
        except ImportError:
            print("✗ Warning: OpenAI not installed, LLM features disabled")
            print(" Install with: pip install openai")
        else:
            kwargs['llm_client'] = OpenAI()
            kwargs['llm_model'] = llm_model
            print(f"✓ LLM integration enabled ({llm_model})")

    if use_azure_di:
        if azure_endpoint and azure_key:
            kwargs['docintel_endpoint'] = azure_endpoint
            # NOTE(review): confirm that MarkItDown honours 'docintel_key';
            # its documentation describes 'docintel_credential' instead, in
            # which case this key would be silently ignored — TODO verify.
            kwargs['docintel_key'] = azure_key
            print("✓ Azure Document Intelligence enabled")
        else:
            print("✗ Warning: Azure credentials not provided, Azure DI disabled")

    return MarkItDown(**kwargs)
|
||||
|
||||
|
||||
def convert_file(
|
||||
md: MarkItDown,
|
||||
input_path: Path,
|
||||
output_dir: Path,
|
||||
verbose: bool = False
|
||||
) -> bool:
|
||||
def convert_file(md: MarkItDown, file_path: Path, output_dir: Path, verbose: bool = False) -> tuple[bool, str, str]:
|
||||
"""
|
||||
Convert a single file to Markdown.
|
||||
|
||||
|
||||
Args:
|
||||
md: MarkItDown instance
|
||||
input_path: Path to input file
|
||||
file_path: Path to input file
|
||||
output_dir: Directory for output files
|
||||
verbose: Print detailed progress
|
||||
|
||||
verbose: Print detailed messages
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
Tuple of (success, input_path, message)
|
||||
"""
|
||||
try:
|
||||
if verbose:
|
||||
print(f" Processing: {input_path.name}")
|
||||
|
||||
# Convert file
|
||||
result = md.convert(str(input_path))
|
||||
|
||||
# Create output filename
|
||||
output_filename = input_path.stem + '.md'
|
||||
output_path = output_dir / output_filename
|
||||
|
||||
# Write output
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
f.write(result.text_content)
|
||||
|
||||
if verbose:
|
||||
print(f" ✓ Converted: {input_path.name} → {output_filename}")
|
||||
|
||||
return True
|
||||
|
||||
print(f"Converting: {file_path}")
|
||||
|
||||
result = md.convert(str(file_path))
|
||||
|
||||
# Create output path
|
||||
output_file = output_dir / f"{file_path.stem}.md"
|
||||
|
||||
# Write content with metadata header
|
||||
content = f"# {result.title or file_path.stem}\n\n"
|
||||
content += f"**Source**: {file_path.name}\n"
|
||||
content += f"**Format**: {file_path.suffix}\n\n"
|
||||
content += "---\n\n"
|
||||
content += result.text_content
|
||||
|
||||
output_file.write_text(content, encoding='utf-8')
|
||||
|
||||
return True, str(file_path), f"✓ Converted to {output_file.name}"
|
||||
|
||||
except Exception as e:
|
||||
print(f" ✗ Error converting {input_path.name}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def find_files(input_dir: Path, recursive: bool = False) -> List[Path]:
    """
    Find all supported files in a directory.

    The directory is walked once and entries are filtered by suffix, so
    matching is case-insensitive (``report.PDF`` is found) and directories
    whose names happen to end in a supported extension are excluded.

    Args:
        input_dir: Directory to search
        recursive: Search subdirectories

    Returns:
        Sorted list of paths to regular files with a supported extension
    """
    candidates = input_dir.rglob("*") if recursive else input_dir.glob("*")
    return sorted(
        path
        for path in candidates
        if path.is_file() and path.suffix.lower() in SUPPORTED_EXTENSIONS
    )
|
||||
return False, str(file_path), f"✗ Error: {str(e)}"
|
||||
|
||||
|
||||
def batch_convert(
|
||||
input_dir: str,
|
||||
output_dir: str,
|
||||
input_dir: Path,
|
||||
output_dir: Path,
|
||||
extensions: Optional[List[str]] = None,
|
||||
recursive: bool = False,
|
||||
use_llm: bool = False,
|
||||
llm_model: str = "gpt-4o",
|
||||
use_azure_di: bool = False,
|
||||
azure_endpoint: Optional[str] = None,
|
||||
azure_key: Optional[str] = None,
|
||||
verbose: bool = False
|
||||
) -> None:
|
||||
workers: int = 4,
|
||||
verbose: bool = False,
|
||||
enable_plugins: bool = False
|
||||
) -> dict:
|
||||
"""
|
||||
Batch convert all supported files in a directory.
|
||||
|
||||
Batch convert files in a directory.
|
||||
|
||||
Args:
|
||||
input_dir: Input directory containing files
|
||||
output_dir: Output directory for Markdown files
|
||||
input_dir: Input directory
|
||||
output_dir: Output directory
|
||||
extensions: List of file extensions to convert (e.g., ['.pdf', '.docx'])
|
||||
recursive: Search subdirectories
|
||||
use_llm: Enable LLM-powered descriptions
|
||||
llm_model: LLM model to use
|
||||
use_azure_di: Enable Azure Document Intelligence
|
||||
azure_endpoint: Azure DI endpoint
|
||||
azure_key: Azure DI API key
|
||||
verbose: Print detailed progress
|
||||
workers: Number of parallel workers
|
||||
verbose: Print detailed messages
|
||||
enable_plugins: Enable MarkItDown plugins
|
||||
|
||||
Returns:
|
||||
Dictionary with conversion statistics
|
||||
"""
|
||||
input_path = Path(input_dir)
|
||||
output_path = Path(output_dir)
|
||||
|
||||
# Validate input directory
|
||||
if not input_path.exists():
|
||||
print(f"✗ Error: Input directory '{input_dir}' does not exist")
|
||||
sys.exit(1)
|
||||
|
||||
if not input_path.is_dir():
|
||||
print(f"✗ Error: '{input_dir}' is not a directory")
|
||||
sys.exit(1)
|
||||
|
||||
# Create output directory
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Setup MarkItDown
|
||||
print("Setting up MarkItDown...")
|
||||
md = setup_markitdown(
|
||||
use_llm=use_llm,
|
||||
llm_model=llm_model,
|
||||
use_azure_di=use_azure_di,
|
||||
azure_endpoint=azure_endpoint,
|
||||
azure_key=azure_key
|
||||
)
|
||||
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Default extensions if not specified
|
||||
if extensions is None:
|
||||
extensions = ['.pdf', '.docx', '.pptx', '.xlsx', '.html', '.jpg', '.png']
|
||||
|
||||
# Find files
|
||||
print(f"\nScanning directory: {input_dir}")
|
||||
files = []
|
||||
if recursive:
|
||||
print(" (including subdirectories)")
|
||||
|
||||
files = find_files(input_path, recursive)
|
||||
|
||||
for ext in extensions:
|
||||
files.extend(input_dir.rglob(f"*{ext}"))
|
||||
else:
|
||||
for ext in extensions:
|
||||
files.extend(input_dir.glob(f"*{ext}"))
|
||||
|
||||
if not files:
|
||||
print("✗ No supported files found")
|
||||
print(f" Supported extensions: {', '.join(sorted(SUPPORTED_EXTENSIONS))}")
|
||||
sys.exit(0)
|
||||
|
||||
print(f"✓ Found {len(files)} file(s) to convert\n")
|
||||
|
||||
# Convert files
|
||||
successful = 0
|
||||
failed = 0
|
||||
|
||||
for file_path in files:
|
||||
if convert_file(md, file_path, output_path, verbose):
|
||||
successful += 1
|
||||
else:
|
||||
failed += 1
|
||||
|
||||
# Summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Conversion complete!")
|
||||
print(f" Successful: {successful}")
|
||||
print(f" Failed: {failed}")
|
||||
print(f" Output: {output_dir}")
|
||||
print(f"{'='*60}")
|
||||
print(f"No files found with extensions: {', '.join(extensions)}")
|
||||
return {'total': 0, 'success': 0, 'failed': 0}
|
||||
|
||||
print(f"Found {len(files)} file(s) to convert")
|
||||
|
||||
# Create MarkItDown instance
|
||||
md = MarkItDown(enable_plugins=enable_plugins)
|
||||
|
||||
# Convert files in parallel
|
||||
results = {
|
||||
'total': len(files),
|
||||
'success': 0,
|
||||
'failed': 0,
|
||||
'details': []
|
||||
}
|
||||
|
||||
with ThreadPoolExecutor(max_workers=workers) as executor:
|
||||
futures = {
|
||||
executor.submit(convert_file, md, file_path, output_dir, verbose): file_path
|
||||
for file_path in files
|
||||
}
|
||||
|
||||
for future in as_completed(futures):
|
||||
success, path, message = future.result()
|
||||
|
||||
if success:
|
||||
results['success'] += 1
|
||||
else:
|
||||
results['failed'] += 1
|
||||
|
||||
results['details'].append({
|
||||
'file': path,
|
||||
'success': success,
|
||||
'message': message
|
||||
})
|
||||
|
||||
print(message)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def main():
|
||||
"""Main entry point."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Batch convert files to Markdown using MarkItDown",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Basic usage
|
||||
python batch_convert.py documents/ output/
|
||||
|
||||
# Recursive conversion
|
||||
python batch_convert.py documents/ output/ --recursive
|
||||
|
||||
# With LLM-powered image descriptions
|
||||
python batch_convert.py documents/ output/ --llm
|
||||
|
||||
# With Azure Document Intelligence
|
||||
python batch_convert.py documents/ output/ --azure \\
|
||||
--azure-endpoint https://example.cognitiveservices.azure.com/ \\
|
||||
--azure-key YOUR-KEY
|
||||
|
||||
# All features enabled
|
||||
python batch_convert.py documents/ output/ --llm --azure \\
|
||||
--azure-endpoint $AZURE_ENDPOINT --azure-key $AZURE_KEY
|
||||
|
||||
Supported file types:
|
||||
Documents: PDF, DOCX, PPTX, XLSX, XLS
|
||||
Images: JPG, PNG, GIF, BMP, TIFF
|
||||
Audio: WAV, MP3, FLAC, OGG, AIFF
|
||||
Web: HTML, EPUB
|
||||
Data: CSV, JSON, XML
|
||||
Archives: ZIP
|
||||
# Convert all PDFs in a directory
|
||||
python batch_convert.py papers/ output/ --extensions .pdf
|
||||
|
||||
# Convert multiple formats recursively
|
||||
python batch_convert.py documents/ markdown/ --extensions .pdf .docx .pptx -r
|
||||
|
||||
# Use 8 parallel workers
|
||||
python batch_convert.py input/ output/ --workers 8
|
||||
|
||||
# Enable plugins
|
||||
python batch_convert.py input/ output/ --plugins
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
parser.add_argument('input_dir', type=Path, help='Input directory')
|
||||
parser.add_argument('output_dir', type=Path, help='Output directory')
|
||||
parser.add_argument(
|
||||
'input_dir',
|
||||
help='Input directory containing files to convert'
|
||||
'--extensions', '-e',
|
||||
nargs='+',
|
||||
help='File extensions to convert (e.g., .pdf .docx)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'output_dir',
|
||||
help='Output directory for Markdown files'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-r', '--recursive',
|
||||
'--recursive', '-r',
|
||||
action='store_true',
|
||||
help='Recursively search subdirectories'
|
||||
help='Search subdirectories recursively'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--llm',
|
||||
'--workers', '-w',
|
||||
type=int,
|
||||
default=4,
|
||||
help='Number of parallel workers (default: 4)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--verbose', '-v',
|
||||
action='store_true',
|
||||
help='Enable LLM-powered image descriptions (requires OpenAI API key)'
|
||||
help='Verbose output'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--llm-model',
|
||||
default='gpt-4o',
|
||||
help='LLM model to use (default: gpt-4o)'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--azure',
|
||||
'--plugins', '-p',
|
||||
action='store_true',
|
||||
help='Enable Azure Document Intelligence for PDFs'
|
||||
help='Enable MarkItDown plugins'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--azure-endpoint',
|
||||
help='Azure Document Intelligence endpoint URL'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--azure-key',
|
||||
help='Azure Document Intelligence API key'
|
||||
)
|
||||
parser.add_argument(
|
||||
'-v', '--verbose',
|
||||
action='store_true',
|
||||
help='Print detailed progress'
|
||||
)
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Environment variable fallbacks for Azure
|
||||
azure_endpoint = args.azure_endpoint or os.getenv('AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT')
|
||||
azure_key = args.azure_key or os.getenv('AZURE_DOCUMENT_INTELLIGENCE_KEY')
|
||||
|
||||
batch_convert(
|
||||
|
||||
# Validate input directory
|
||||
if not args.input_dir.exists():
|
||||
print(f"Error: Input directory '{args.input_dir}' does not exist")
|
||||
sys.exit(1)
|
||||
|
||||
if not args.input_dir.is_dir():
|
||||
print(f"Error: '{args.input_dir}' is not a directory")
|
||||
sys.exit(1)
|
||||
|
||||
# Run batch conversion
|
||||
results = batch_convert(
|
||||
input_dir=args.input_dir,
|
||||
output_dir=args.output_dir,
|
||||
extensions=args.extensions,
|
||||
recursive=args.recursive,
|
||||
use_llm=args.llm,
|
||||
llm_model=args.llm_model,
|
||||
use_azure_di=args.azure,
|
||||
azure_endpoint=azure_endpoint,
|
||||
azure_key=azure_key,
|
||||
verbose=args.verbose
|
||||
workers=args.workers,
|
||||
verbose=args.verbose,
|
||||
enable_plugins=args.plugins
|
||||
)
|
||||
|
||||
# Print summary
|
||||
print("\n" + "="*50)
|
||||
print("CONVERSION SUMMARY")
|
||||
print("="*50)
|
||||
print(f"Total files: {results['total']}")
|
||||
print(f"Successful: {results['success']}")
|
||||
print(f"Failed: {results['failed']}")
|
||||
print(f"Success rate: {results['success']/results['total']*100:.1f}%" if results['total'] > 0 else "N/A")
|
||||
|
||||
# Show failed files if any
|
||||
if results['failed'] > 0:
|
||||
print("\nFailed conversions:")
|
||||
for detail in results['details']:
|
||||
if not detail['success']:
|
||||
print(f" - {detail['file']}: {detail['message']}")
|
||||
|
||||
sys.exit(0 if results['failed'] == 0 else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
|
||||
283
scientific-skills/markitdown/scripts/convert_literature.py
Executable file
283
scientific-skills/markitdown/scripts/convert_literature.py
Executable file
@@ -0,0 +1,283 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert scientific literature PDFs to Markdown for analysis and review.
|
||||
|
||||
This script is specifically designed for converting academic papers,
|
||||
organizing them, and preparing them for literature review workflows.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
from markitdown import MarkItDown
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def extract_metadata_from_filename(filename: str) -> Dict[str, str]:
    """
    Try to extract metadata from a filename.

    Supports patterns like: Author_Year_Title.pdf (underscores or dashes
    as separators).

    Args:
        filename: File name (with or without extension or directories)

    Returns:
        Dictionary that may contain 'year', 'author' and 'title'. 'title'
        is omitted when nothing but the author and year could be found, so
        callers can fall back to another source for it.
    """
    metadata: Dict[str, str] = {}

    # Remove extension
    name = Path(filename).stem

    # A four-digit year starting with 19/20 that is not part of a longer
    # number. Digit lookarounds are used instead of \b because '_' is a
    # word character, so \b never matches inside "Smith_2023_Title".
    year_match = re.search(r'(?<!\d)(?:19|20)\d{2}(?!\d)', name)
    year = year_match.group() if year_match else None
    if year:
        metadata['year'] = year

    # Split by underscores or dashes, ignoring empty pieces produced by
    # doubled or leading separators
    tokens = [token for token in re.split(r'[_\-]', name) if token]
    if len(tokens) >= 2:
        metadata['author'] = tokens[0]
        # Keep the year out of the title; it is reported separately
        title_tokens = [token for token in tokens[1:] if token != year]
        if title_tokens:
            metadata['title'] = ' '.join(title_tokens)
    else:
        metadata['title'] = name.replace('_', ' ')

    return metadata
|
||||
|
||||
|
||||
def convert_paper(
    md: MarkItDown,
    input_file: Path,
    output_dir: Path,
    organize_by_year: bool = False
) -> tuple[bool, Dict]:
    """
    Convert a single paper to Markdown with metadata extraction.

    The output file starts with a YAML front-matter block, followed by a
    title heading, a "Document Information" section and the converted text.

    Args:
        md: MarkItDown instance
        input_file: Path to PDF file
        output_dir: Output directory
        organize_by_year: Organize into year subdirectories

    Returns:
        Tuple of (success, metadata_dict). On success the dict holds the
        extracted metadata plus 'source_file', 'converted_date' and
        'markdown_file' (output path relative to output_dir, POSIX style).
        On failure it holds 'source_file' and 'error'.
    """
    try:
        print(f"Converting: {input_file.name}")

        # Convert to Markdown
        result = md.convert(str(input_file))

        # Extract metadata from filename
        metadata = extract_metadata_from_filename(input_file.name)
        metadata['source_file'] = input_file.name
        metadata['converted_date'] = datetime.now().isoformat()

        # Try to extract title from content if not in filename
        if 'title' not in metadata and result.title:
            metadata['title'] = result.title

        # Create output path
        if organize_by_year and 'year' in metadata:
            output_subdir = output_dir / metadata['year']
        else:
            output_subdir = output_dir
        output_subdir.mkdir(parents=True, exist_ok=True)

        output_file = output_subdir / f"{input_file.stem}.md"
        # Record where the file really went so an index can link to it
        # regardless of whether year subdirectories are in use.
        metadata['markdown_file'] = output_file.relative_to(output_dir).as_posix()

        title = metadata.get('title', input_file.stem)

        def quoted(value: str) -> str:
            # A JSON string is a valid YAML double-quoted scalar, so this
            # escapes embedded quotes and backslashes correctly.
            return json.dumps(value, ensure_ascii=False)

        # Create formatted Markdown with front matter
        content = "---\n"
        content += f"title: {quoted(title)}\n"
        if 'author' in metadata:
            content += f"author: {quoted(metadata['author'])}\n"
        if 'year' in metadata:
            content += f"year: {metadata['year']}\n"
        content += f"source: {quoted(metadata['source_file'])}\n"
        content += f"converted: {quoted(metadata['converted_date'])}\n"
        content += "---\n\n"

        # Add title
        content += f"# {title}\n\n"

        # Add metadata section
        content += "## Document Information\n\n"
        if 'author' in metadata:
            content += f"**Author**: {metadata['author']}\n"
        if 'year' in metadata:
            content += f"**Year**: {metadata['year']}\n"
        content += f"**Source File**: {metadata['source_file']}\n"
        content += f"**Converted**: {metadata['converted_date']}\n\n"
        content += "---\n\n"

        # Add content (an empty conversion still yields a valid file)
        content += result.text_content or ""

        # Write to file
        output_file.write_text(content, encoding='utf-8')

        print(f"✓ Saved to: {output_file}")

        return True, metadata

    except Exception as e:
        # Per-file boundary: report the failure and let the batch continue.
        print(f"✗ Error converting {input_file.name}: {str(e)}")
        return False, {'source_file': input_file.name, 'error': str(e)}
|
||||
|
||||
|
||||
def create_index(papers: List[Dict], output_dir: Path):
    """
    Create an index/catalog of all converted papers.

    Writes two files into output_dir: INDEX.md (papers grouped by year,
    each with a link to its Markdown file) and catalog.json (the sorted
    metadata dictionaries).

    Args:
        papers: Metadata dictionaries; the keys read here are 'title',
            'author', 'year', 'source_file' and, when present,
            'markdown_file' (path of the Markdown file relative to
            output_dir)
        output_dir: Directory that receives INDEX.md and catalog.json
    """
    from urllib.parse import quote

    # Sort by year (if available) and title; papers without a year go last
    papers_sorted = sorted(
        papers,
        key=lambda paper: (paper.get('year', '9999'), paper.get('title', ''))
    )

    # Group by year
    by_year: Dict[str, List[Dict]] = {}
    for paper in papers_sorted:
        by_year.setdefault(paper.get('year', 'Unknown'), []).append(paper)

    # Create Markdown index
    chunks = [
        "# Literature Review Index\n\n",
        f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"**Total Papers**: {len(papers)}\n\n",
        "---\n\n",
    ]

    # Write by year
    for year in sorted(by_year):
        chunks.append(f"## {year}\n\n")
        for paper in by_year[year]:
            title = paper.get('title', paper.get('source_file', 'Unknown'))
            author = paper.get('author', 'Unknown Author')
            source = paper.get('source_file', '')

            # Prefer the recorded output location. Without it, fall back to
            # guessing from the source name and year.
            md_file = paper.get('markdown_file')
            if not md_file:
                md_file = Path(source).stem + ".md"
                if 'year' in paper and paper['year'] != 'Unknown':
                    md_file = f"{paper['year']}/{md_file}"

            chunks.append(f"- **{title}**\n")
            chunks.append(f"  - Author: {author}\n")
            chunks.append(f"  - Source: {source}\n")
            # Percent-encode so names with spaces remain valid link targets
            chunks.append(f"  - [Read Markdown]({quote(md_file)})\n\n")

    # Write index
    index_file = output_dir / "INDEX.md"
    index_file.write_text("".join(chunks), encoding='utf-8')
    print(f"\n✓ Created index: {index_file}")

    # Also create JSON catalog
    catalog_file = output_dir / "catalog.json"
    with open(catalog_file, 'w', encoding='utf-8') as f:
        json.dump(papers_sorted, f, indent=2, ensure_ascii=False)
    print(f"✓ Created catalog: {catalog_file}")
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point: convert every PDF in a directory to Markdown.

    Exits with status 0 when all papers converted, 1 when the input
    directory is invalid, no PDFs were found, or any conversion failed.
    """
    parser = argparse.ArgumentParser(
        description="Convert scientific literature PDFs to Markdown",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert all PDFs in a directory
  python convert_literature.py papers/ output/

  # Organize by year
  python convert_literature.py papers/ output/ --organize-by-year

  # Create index of all papers
  python convert_literature.py papers/ output/ --create-index

Filename Conventions:
  For best results, name your PDFs using this pattern:
    Author_Year_Title.pdf

  Examples:
    Smith_2023_Machine_Learning_Applications.pdf
    Jones_2022_Climate_Change_Analysis.pdf
        """
    )

    parser.add_argument('input_dir', type=Path, help='Directory with PDF files')
    parser.add_argument('output_dir', type=Path, help='Output directory for Markdown files')
    parser.add_argument(
        '--organize-by-year', '-y',
        action='store_true',
        help='Organize output into year subdirectories'
    )
    parser.add_argument(
        '--create-index', '-i',
        action='store_true',
        help='Create an index/catalog of all papers'
    )
    parser.add_argument(
        '--recursive', '-r',
        action='store_true',
        help='Search subdirectories recursively'
    )

    args = parser.parse_args()

    # Validate input
    if not args.input_dir.exists():
        print(f"Error: Input directory '{args.input_dir}' does not exist")
        sys.exit(1)

    if not args.input_dir.is_dir():
        print(f"Error: '{args.input_dir}' is not a directory")
        sys.exit(1)

    # Find PDF files. Matching on the lowercased suffix also picks up
    # "paper.PDF"; sorting makes the processing order reproducible.
    candidates = args.input_dir.rglob("*") if args.recursive else args.input_dir.glob("*")
    pdf_files = sorted(
        path for path in candidates
        if path.is_file() and path.suffix.lower() == ".pdf"
    )

    if not pdf_files:
        print("No PDF files found")
        sys.exit(1)

    print(f"Found {len(pdf_files)} PDF file(s)")

    # Create MarkItDown instance
    md = MarkItDown()

    # Convert all papers; only successful conversions are indexed
    results = []
    success_count = 0

    for pdf_file in pdf_files:
        success, metadata = convert_paper(
            md,
            pdf_file,
            args.output_dir,
            args.organize_by_year
        )

        if success:
            success_count += 1
            results.append(metadata)

    # Create index if requested
    if args.create_index and results:
        create_index(results, args.output_dir)

    # Print summary (pdf_files is non-empty here, so the division is safe)
    print("\n" + "="*50)
    print("CONVERSION SUMMARY")
    print("="*50)
    print(f"Total papers: {len(pdf_files)}")
    print(f"Successful: {success_count}")
    print(f"Failed: {len(pdf_files) - success_count}")
    print(f"Success rate: {success_count/len(pdf_files)*100:.1f}%")

    sys.exit(0 if success_count == len(pdf_files) else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
243
scientific-skills/markitdown/scripts/convert_with_ai.py
Executable file
243
scientific-skills/markitdown/scripts/convert_with_ai.py
Executable file
@@ -0,0 +1,243 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert documents to Markdown with AI-enhanced image descriptions.
|
||||
|
||||
This script demonstrates how to use MarkItDown with OpenRouter to generate
|
||||
detailed descriptions of images in documents (PowerPoint, PDFs with images, etc.)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
# Predefined prompts for different use cases
|
||||
# Prompt text sent to the vision model, keyed by the value accepted for
# --prompt-type. Surrounding whitespace is stripped from each entry.
PROMPTS = {
    'scientific': """
Analyze this scientific image or diagram. Provide:
1. Type of visualization (graph, chart, microscopy, diagram, etc.)
2. Key data points, trends, or patterns
3. Axes labels, legends, and scales
4. Notable features or findings
5. Scientific context and significance
Be precise, technical, and detailed.
    """.strip(),

    'presentation': """
Describe this presentation slide image. Include:
1. Main visual elements and their arrangement
2. Key points or messages conveyed
3. Data or information presented
4. Visual hierarchy and emphasis
Keep the description clear and informative.
    """.strip(),

    'general': """
Describe this image in detail. Include:
1. Main subjects and objects
2. Visual composition and layout
3. Text content (if any)
4. Notable details
5. Overall context and purpose
Be comprehensive and accurate.
    """.strip(),

    'data_viz': """
Analyze this data visualization. Provide:
1. Type of chart/graph (bar, line, scatter, pie, etc.)
2. Variables and axes
3. Data ranges and scales
4. Key patterns, trends, or outliers
5. Statistical insights
Focus on quantitative accuracy.
    """.strip(),

    'medical': """
Describe this medical image. Include:
1. Type of medical imaging (X-ray, MRI, CT, microscopy, etc.)
2. Anatomical structures visible
3. Notable findings or abnormalities
4. Image quality and contrast
5. Clinical relevance
Be professional and precise.
    """.strip()
}
|
||||
|
||||
|
||||
def convert_with_ai(
    input_file: Path,
    output_file: Path,
    api_key: str,
    model: str = "anthropic/claude-sonnet-4.5",
    prompt_type: str = "general",
    custom_prompt: str = None
) -> bool:
    """
    Convert a file to Markdown with AI image descriptions.

    The output file begins with a short metadata header (source, format,
    model, prompt type) followed by the converted text. Parent directories
    of output_file are created as needed.

    Args:
        input_file: Path to input file
        output_file: Path to output Markdown file
        api_key: OpenRouter API key
        model: Model name (default: anthropic/claude-sonnet-4.5)
        prompt_type: Type of prompt to use; an unknown value falls back to
            the 'general' prompt
        custom_prompt: Custom prompt (overrides prompt_type)

    Returns:
        True if successful, False otherwise (the error is printed to stderr)
    """
    try:
        # Initialize OpenRouter client (OpenAI-compatible)
        client = OpenAI(
            api_key=api_key,
            base_url="https://openrouter.ai/api/v1"
        )

        # Select the prompt together with the label that is reported, so
        # the label always names the prompt that was actually used.
        if custom_prompt:
            prompt, prompt_label = custom_prompt, 'custom'
        elif prompt_type in PROMPTS:
            prompt, prompt_label = PROMPTS[prompt_type], prompt_type
        else:
            prompt, prompt_label = PROMPTS['general'], 'general'

        print(f"Using model: {model}")
        print(f"Prompt type: {prompt_label}")
        print(f"Converting: {input_file}")

        # Create MarkItDown with AI support
        md = MarkItDown(
            llm_client=client,
            llm_model=model,
            llm_prompt=prompt
        )

        # Convert file
        result = md.convert(str(input_file))

        # Create output with metadata
        content = f"# {result.title or input_file.stem}\n\n"
        content += f"**Source**: {input_file.name}\n"
        content += f"**Format**: {input_file.suffix}\n"
        content += f"**AI Model**: {model}\n"
        content += f"**Prompt Type**: {prompt_label}\n\n"
        content += "---\n\n"
        # An empty conversion still yields a valid file
        content += result.text_content or ""

        # Write output
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(content, encoding='utf-8')

        print(f"✓ Successfully converted to: {output_file}")
        return True

    except Exception as e:
        # Single-file boundary: report and signal failure via the return value.
        print(f"✗ Error: {str(e)}", file=sys.stderr)
        return False
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point for AI-enhanced document conversion.

    Exits with status 0 on success (or after --list-prompts), 1 when the
    API key or input file is missing or the conversion fails, and 2 on a
    command-line usage error.
    """
    parser = argparse.ArgumentParser(
        description="Convert documents to Markdown with AI-enhanced image descriptions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Available prompt types:
  scientific    - For scientific diagrams, graphs, and charts
  presentation  - For presentation slides
  general       - General-purpose image description
  data_viz      - For data visualizations and charts
  medical       - For medical imaging

Examples:
  # Convert a scientific paper
  python convert_with_ai.py paper.pdf output.md --prompt-type scientific

  # Convert a presentation with custom model
  python convert_with_ai.py slides.pptx slides.md --model anthropic/claude-sonnet-4.5 --prompt-type presentation

  # Use custom prompt with advanced vision model
  python convert_with_ai.py diagram.png diagram.md --model anthropic/claude-sonnet-4.5 --custom-prompt "Describe this technical diagram"

  # Set API key via environment variable
  export OPENROUTER_API_KEY="sk-or-v1-..."
  python convert_with_ai.py image.jpg image.md

Environment Variables:
  OPENROUTER_API_KEY    OpenRouter API key (required if not passed via --api-key)

Popular Models (use with --model):
  anthropic/claude-sonnet-4.5 - Recommended for scientific vision
  anthropic/claude-opus-4.5   - Advanced vision model
  openai/gpt-4o               - GPT-4 Omni (vision support)
  openai/gpt-4-vision         - GPT-4 Vision
  google/gemini-pro-vision    - Gemini Pro Vision
        """
    )

    # The positionals are optional at parse time so that --list-prompts
    # can be used on its own; they are enforced below for conversions.
    parser.add_argument('input', type=Path, nargs='?', help='Input file')
    parser.add_argument('output', type=Path, nargs='?', help='Output Markdown file')
    parser.add_argument(
        '--api-key', '-k',
        help='OpenRouter API key (or set OPENROUTER_API_KEY env var)'
    )
    parser.add_argument(
        '--model', '-m',
        default='anthropic/claude-sonnet-4.5',
        help='Model to use via OpenRouter (default: anthropic/claude-sonnet-4.5)'
    )
    parser.add_argument(
        '--prompt-type', '-t',
        choices=list(PROMPTS.keys()),
        default='general',
        help='Type of prompt to use (default: general)'
    )
    parser.add_argument(
        '--custom-prompt', '-p',
        help='Custom prompt (overrides --prompt-type)'
    )
    parser.add_argument(
        '--list-prompts', '-l',
        action='store_true',
        help='List available prompt types and exit'
    )

    args = parser.parse_args()

    # List prompts and exit
    if args.list_prompts:
        print("Available prompt types:\n")
        for name, prompt in PROMPTS.items():
            print(f"[{name}]")
            print(prompt)
            print("\n" + "="*60 + "\n")
        sys.exit(0)

    # A conversion needs both paths (parser.error exits with status 2)
    if args.input is None or args.output is None:
        parser.error("the following arguments are required: input, output")

    # Get API key
    api_key = args.api_key or os.environ.get('OPENROUTER_API_KEY')
    if not api_key:
        print(
            "Error: OpenRouter API key required. Set OPENROUTER_API_KEY environment variable or use --api-key",
            file=sys.stderr
        )
        print("Get your API key at: https://openrouter.ai/keys", file=sys.stderr)
        sys.exit(1)

    # Validate input file
    if not args.input.exists():
        print(f"Error: Input file '{args.input}' does not exist", file=sys.stderr)
        sys.exit(1)

    # Convert file
    success = convert_with_ai(
        input_file=args.input,
        output_file=args.output,
        api_key=api_key,
        model=args.model,
        prompt_type=args.prompt_type,
        custom_prompt=args.custom_prompt
    )

    sys.exit(0 if success else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user