Initial commit for research-lookup
scripts/research_lookup.py (Executable file, +474 lines)
@@ -0,0 +1,474 @@
#!/usr/bin/env python3
"""
Research Information Lookup Tool
Uses Perplexity's Sonar Pro Search model through OpenRouter for academic research queries.
"""
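
# Example CLI usage (illustrative; the flags are defined in main() below and the
# example query is hypothetical):
#   export OPENROUTER_API_KEY="your_openrouter_api_key"
#   python scripts/research_lookup.py "CRISPR off-target detection methods" --json -o results.json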

import os
import json
import requests
import time
from datetime import datetime
from typing import Dict, List, Optional, Any
from urllib.parse import quote


class ResearchLookup:
    """Research information lookup using Perplexity Sonar models via OpenRouter."""

    # Available models
    MODELS = {
        "pro": "perplexity/sonar-pro",  # Fast lookup, cost-effective
        "reasoning": "perplexity/sonar-reasoning-pro",  # Deep analysis with reasoning
    }

    # Keywords that indicate complex queries requiring reasoning model
    REASONING_KEYWORDS = [
        "compare", "contrast", "analyze", "analysis", "evaluate", "critique",
        "versus", "vs", "vs.", "compared to", "differences between", "similarities",
        "meta-analysis", "systematic review", "synthesis", "integrate",
        "mechanism", "why", "how does", "how do", "explain", "relationship",
        "theoretical framework", "implications", "interpret", "reasoning",
        "controversy", "conflicting", "paradox", "debate", "reconcile",
        "pros and cons", "advantages and disadvantages", "trade-off", "tradeoff",
    ]

    def __init__(self, force_model: Optional[str] = None):
        """
        Initialize the research lookup tool.

        Args:
            force_model: Optional model override ('pro' or 'reasoning').
                If None, model is auto-selected based on query complexity.
        """
        self.api_key = os.getenv("OPENROUTER_API_KEY")
        if not self.api_key:
            raise ValueError("OPENROUTER_API_KEY environment variable not set")

        self.base_url = "https://openrouter.ai/api/v1"
        self.force_model = force_model
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://scientific-writer.local",
            "X-Title": "Scientific Writer Research Tool"
        }

    def _select_model(self, query: str) -> str:
        """
        Select the appropriate model based on query complexity.

        Args:
            query: The research query

        Returns:
            Model identifier string
        """
        if self.force_model:
            return self.MODELS.get(self.force_model, self.MODELS["reasoning"])

        # Check for reasoning keywords (case-insensitive)
        query_lower = query.lower()
        for keyword in self.REASONING_KEYWORDS:
            if keyword in query_lower:
                return self.MODELS["reasoning"]

        # Check for multiple questions or complex structure
        question_count = query.count("?")
        if question_count >= 2:
            return self.MODELS["reasoning"]

        # Check for very long queries (likely complex)
        if len(query) > 200:
            return self.MODELS["reasoning"]

        # Default to pro for simple lookups
        return self.MODELS["pro"]

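    # Illustrative routing under the heuristics above (hypothetical queries):
    #   "CRISPR delivery methods for T cells"         -> MODELS["pro"]        (no keyword, short, no "?")
    #   "Compare CRISPR and base editing efficiency"  -> MODELS["reasoning"]  (keyword "compare")
    #   "Why does X occur? How does Y modulate it?"   -> MODELS["reasoning"]  (keywords, two questions)
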
    def _make_request(self, messages: List[Dict[str, str]], model: str, **kwargs) -> Dict[str, Any]:
        """Make a request to the OpenRouter API with academic search mode."""
        data = {
            "model": model,
            "messages": messages,
            "max_tokens": 8000,
            "temperature": 0.1,  # Low temperature for factual research
            # Perplexity-specific parameters for academic search
            "search_mode": "academic",  # Prioritize scholarly sources (peer-reviewed papers, journals)
            "search_context_size": "high",  # Always use high context for deeper research
            **kwargs
        }

        try:
            response = requests.post(
                f"{self.base_url}/chat/completions",
                headers=self.headers,
                json=data,
                timeout=90  # Increased timeout for academic search
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            raise Exception(f"API request failed: {str(e)}")

    def _format_research_prompt(self, query: str) -> str:
        """Format the query for optimal research results."""
        return f"""You are an expert research assistant. Please provide comprehensive, accurate research information for the following query: "{query}"

IMPORTANT INSTRUCTIONS:
1. Focus on ACADEMIC and SCIENTIFIC sources (peer-reviewed papers, reputable journals, institutional research)
2. Include RECENT information (prioritize 2020-2026 publications)
3. Provide COMPLETE citations with authors, title, journal/conference, year, and DOI when available
4. Structure your response with clear sections and proper attribution
5. Be comprehensive but concise - aim for 800-1200 words
6. Include key findings, methodologies, and implications when relevant
7. Note any controversies, limitations, or conflicting evidence

PAPER QUALITY AND POPULARITY PRIORITIZATION (CRITICAL):
8. ALWAYS prioritize HIGHLY-CITED papers over obscure publications:
   - Recent papers (0-3 years): prefer 20+ citations, highlight 100+ as highly influential
   - Mid-age papers (3-7 years): prefer 100+ citations, highlight 500+ as landmark
   - Older papers (7+ years): prefer 500+ citations, highlight 1000+ as foundational
9. ALWAYS prioritize papers from TOP-TIER VENUES:
   - Tier 1 (highest priority): Nature, Science, Cell, NEJM, Lancet, JAMA, PNAS, Nature Medicine, Nature Biotechnology
   - Tier 2 (high priority): High-impact specialized journals (IF>10), top conferences (NeurIPS, ICML, ICLR for AI/ML)
   - Tier 3: Respected specialized journals (IF 5-10)
   - Only cite lower-tier venues if directly relevant AND no better source exists
10. PREFER papers from ESTABLISHED, REPUTABLE AUTHORS:
    - Senior researchers with high h-index and multiple high-impact publications
    - Leading research groups at recognized institutions
    - Authors with recognized expertise (awards, editorial positions)
11. For EACH citation, include when available:
    - Approximate citation count (e.g., "cited 500+ times")
    - Journal/venue tier indicator
    - Notable author credentials if relevant
12. PRIORITIZE papers that DIRECTLY address the research question over tangentially related work

RESPONSE FORMAT:
- Start with a brief summary (2-3 sentences)
- Present key findings and studies in organized sections
- Rank papers by impact: most influential/cited first
- End with future directions or research gaps if applicable
- Include 5-8 high-quality citations, emphasizing Tier-1 venues and highly-cited papers

Remember: Quality over quantity. Prioritize influential, highly-cited papers from prestigious venues and established researchers."""

    def lookup(self, query: str) -> Dict[str, Any]:
        """Perform a research lookup for the given query."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Select model based on query complexity
        model = self._select_model(query)

        # Format the research prompt
        research_prompt = self._format_research_prompt(query)

        # Prepare messages for the API with system message for academic mode
        messages = [
            {
                "role": "system",
                "content": """You are an academic research assistant specializing in finding HIGH-IMPACT, INFLUENTIAL research.

QUALITY PRIORITIZATION (CRITICAL):
- ALWAYS prefer highly-cited papers over obscure publications
- ALWAYS prioritize Tier-1 venues: Nature, Science, Cell, NEJM, Lancet, JAMA, PNAS, and their family journals
- ALWAYS prefer papers from established researchers with strong publication records
- Include citation counts when known (e.g., "cited 500+ times")
- Quality matters more than quantity - 5 excellent papers beats 10 mediocre ones

VENUE HIERARCHY:
1. Nature/Science/Cell family, NEJM, Lancet, JAMA (highest priority)
2. High-impact specialized journals (IF>10), top ML conferences (NeurIPS, ICML, ICLR)
3. Respected field-specific journals (IF 5-10)
4. Other peer-reviewed sources (only if no better option exists)

Focus exclusively on scholarly sources: peer-reviewed journals, academic papers, research institutions. Prioritize recent academic literature (2020-2026) and provide complete citations with DOIs. Always indicate paper impact through citation counts and venue prestige."""
            },
            {"role": "user", "content": research_prompt}
        ]

        try:
            # Make the API request
            response = self._make_request(messages, model)

            # Extract the response content
            if "choices" in response and len(response["choices"]) > 0:
                choice = response["choices"][0]
                if "message" in choice and "content" in choice["message"]:
                    content = choice["message"]["content"]

                    # Extract citations from API response (Perplexity provides these)
                    api_citations = self._extract_api_citations(response, choice)

                    # Also extract citations from text as fallback
                    text_citations = self._extract_citations_from_text(content)

                    # Combine: API-provided citations first, then text-extracted citations
                    citations = api_citations + text_citations

                    return {
                        "success": True,
                        "query": query,
                        "response": content,
                        "citations": citations,
                        "sources": api_citations,  # Separate field for API-provided sources
                        "timestamp": timestamp,
                        "model": model,
                        "usage": response.get("usage", {})
                    }
                else:
                    raise Exception("Invalid response format from API")
            else:
                raise Exception("No response choices received from API")

        except Exception as e:
            return {
                "success": False,
                "query": query,
                "error": str(e),
                "timestamp": timestamp,
                "model": model
            }

    def _extract_api_citations(self, response: Dict[str, Any], choice: Dict[str, Any]) -> List[Dict[str, str]]:
        """Extract citations from Perplexity API response fields."""
        citations = []

        # Perplexity returns citations in search_results field (new format)
        # Check multiple possible locations where OpenRouter might place them
        search_results = (
            response.get("search_results") or
            choice.get("search_results") or
            choice.get("message", {}).get("search_results") or
            []
        )

        for result in search_results:
            citation = {
                "type": "source",
                "title": result.get("title", ""),
                "url": result.get("url", ""),
                "date": result.get("date", ""),
            }
            # Add snippet if available (newer API feature)
            if result.get("snippet"):
                citation["snippet"] = result.get("snippet")
            citations.append(citation)

        # Also check for legacy citations field (backward compatibility)
        legacy_citations = (
            response.get("citations") or
            choice.get("citations") or
            choice.get("message", {}).get("citations") or
            []
        )

        for url in legacy_citations:
            if isinstance(url, str):
                # Legacy format was just URLs
                citations.append({
                    "type": "source",
                    "url": url,
                    "title": "",
                    "date": ""
                })
            elif isinstance(url, dict):
                citations.append({
                    "type": "source",
                    "url": url.get("url", ""),
                    "title": url.get("title", ""),
                    "date": url.get("date", "")
                })

        return citations

    def _extract_citations_from_text(self, text: str) -> List[Dict[str, str]]:
        """Extract potential citations from the response text as fallback."""
        import re
        citations = []

        # Look for DOI patterns first (most reliable)
        # Matches: doi:10.xxx, DOI: 10.xxx, https://doi.org/10.xxx
        doi_pattern = r'(?:doi[:\s]*|https?://(?:dx\.)?doi\.org/)(10\.[0-9]{4,}/[^\s\)\]\,\[\<\>]+)'
        doi_matches = re.findall(doi_pattern, text, re.IGNORECASE)
        seen_dois = set()

        for doi in doi_matches:
            # Clean up DOI - remove trailing punctuation and brackets
            doi_clean = doi.strip().rstrip('.,;:)]')
            if doi_clean and doi_clean not in seen_dois:
                seen_dois.add(doi_clean)
                citations.append({
                    "type": "doi",
                    "doi": doi_clean,
                    "url": f"https://doi.org/{doi_clean}"
                })

        # Look for URLs that might be sources
        url_pattern = r'https?://[^\s\)\]\,\<\>\"\']*(?:arxiv\.org|pubmed|ncbi\.nlm\.nih\.gov|nature\.com|science\.org|wiley\.com|springer\.com|ieee\.org|acm\.org)[^\s\)\]\,\<\>\"\']*'
        url_matches = re.findall(url_pattern, text, re.IGNORECASE)
        seen_urls = set()

        for url in url_matches:
            url_clean = url.rstrip('.')
            if url_clean not in seen_urls:
                seen_urls.add(url_clean)
                citations.append({
                    "type": "url",
                    "url": url_clean
                })

        return citations

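    # Illustrative strings the DOI pattern above is meant to catch (format examples only):
    #   "doi:10.1000/182"
    #   "DOI: 10.1000/xyz123"
    #   "https://doi.org/10.1000/182"
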
    def batch_lookup(self, queries: List[str], delay: float = 1.0) -> List[Dict[str, Any]]:
        """Perform multiple research lookups with optional delay between requests."""
        results = []

        for i, query in enumerate(queries):
            if i > 0 and delay > 0:
                time.sleep(delay)  # Rate limiting

            result = self.lookup(query)
            results.append(result)

            # Print progress
            print(f"[Research] Completed query {i+1}/{len(queries)}: {query[:50]}...")

        return results

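    # Illustrative batch call (hypothetical instance and queries):
    #   tool = ResearchLookup()
    #   tool.batch_lookup(["optogenetic control of memory", "AAV serotypes for CNS delivery"], delay=2.0)
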
    def get_model_info(self) -> Dict[str, Any]:
        """Get information about available models from OpenRouter."""
        try:
            response = requests.get(
                f"{self.base_url}/models",
                headers=self.headers,
                timeout=30
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            return {"error": str(e)}


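# Example programmatic usage (a minimal sketch; assumes OPENROUTER_API_KEY is set,
# that the module is importable from the caller's path, and the query is hypothetical):
#
#   from research_lookup import ResearchLookup
#   tool = ResearchLookup()                  # model auto-selected per query
#   result = tool.lookup("mRNA vaccine thermostability")
#   if result["success"]:
#       print(result["response"])
#       for source in result["sources"]:
#           print(source["title"], source["url"])
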
def main():
    """Command-line interface for testing the research lookup tool."""
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Research Information Lookup Tool")
    parser.add_argument("query", nargs="?", help="Research query to look up")
    parser.add_argument("--model-info", action="store_true", help="Show available models")
    parser.add_argument("--batch", nargs="+", help="Run multiple queries")
    parser.add_argument("--force-model", choices=["pro", "reasoning"],
                        help="Force specific model: 'pro' for fast lookup, 'reasoning' for deep analysis")
    parser.add_argument("-o", "--output", help="Write output to file instead of stdout")
    parser.add_argument("--json", action="store_true", help="Output results as JSON")

    args = parser.parse_args()

    # Set up output destination
    output_file = None
    if args.output:
        output_file = open(args.output, 'w', encoding='utf-8')

    def write_output(text):
        """Write to file or stdout."""
        if output_file:
            output_file.write(text + '\n')
        else:
            print(text)

    # Check for API key
    if not os.getenv("OPENROUTER_API_KEY"):
        print("Error: OPENROUTER_API_KEY environment variable not set", file=sys.stderr)
        print("Please set it in your .env file or export it:", file=sys.stderr)
        print(" export OPENROUTER_API_KEY='your_openrouter_api_key'", file=sys.stderr)
        if output_file:
            output_file.close()
        return 1

    try:
        research = ResearchLookup(force_model=args.force_model)

        if args.model_info:
            write_output("Available models from OpenRouter:")
            models = research.get_model_info()
            if "data" in models:
                for model in models["data"]:
                    if "perplexity" in model["id"].lower():
                        write_output(f" - {model['id']}: {model.get('name', 'N/A')}")
            if output_file:
                output_file.close()
            return 0

        if not args.query and not args.batch:
            print("Error: No query provided. Use --model-info to see available models.", file=sys.stderr)
            if output_file:
                output_file.close()
            return 1

        if args.batch:
            print(f"Running batch research for {len(args.batch)} queries...", file=sys.stderr)
            results = research.batch_lookup(args.batch)
        else:
            print(f"Researching: {args.query}", file=sys.stderr)
            results = [research.lookup(args.query)]

        # Output as JSON if requested
        if args.json:
            write_output(json.dumps(results, indent=2, ensure_ascii=False))
            if output_file:
                output_file.close()
            return 0

        # Display results in human-readable format
        for i, result in enumerate(results):
            if result["success"]:
                write_output(f"\n{'='*80}")
                write_output(f"Query {i+1}: {result['query']}")
                write_output(f"Timestamp: {result['timestamp']}")
                write_output(f"Model: {result['model']}")
                write_output(f"{'='*80}")
                write_output(result["response"])

                # Display API-provided sources first (most reliable)
                sources = result.get("sources", [])
                if sources:
                    write_output(f"\n📚 Sources ({len(sources)}):")
                    for j, source in enumerate(sources):
                        title = source.get("title", "Untitled")
                        url = source.get("url", "")
                        date = source.get("date", "")
                        date_str = f" ({date})" if date else ""
                        write_output(f" [{j+1}] {title}{date_str}")
                        if url:
                            write_output(f" {url}")

                # Display additional text-extracted citations
                citations = result.get("citations", [])
                text_citations = [c for c in citations if c.get("type") in ("doi", "url")]
                if text_citations:
                    write_output(f"\n🔗 Additional References ({len(text_citations)}):")
                    for j, citation in enumerate(text_citations):
                        if citation.get("type") == "doi":
                            write_output(f" [{j+1}] DOI: {citation.get('doi', '')} - {citation.get('url', '')}")
                        elif citation.get("type") == "url":
                            write_output(f" [{j+1}] {citation.get('url', '')}")

                if result.get("usage"):
                    write_output(f"\nUsage: {result['usage']}")
            else:
                write_output(f"\nError in query {i+1}: {result['error']}")

        if output_file:
            output_file.close()
        return 0

    except Exception as e:
        print(f"Error: {str(e)}", file=sys.stderr)
        if output_file:
            output_file.close()
        return 1


if __name__ == "__main__":
    exit(main())