Added parallel-web skill

Refactor research lookup skill to enhance backend routing and update documentation. The skill now intelligently selects between the Parallel Chat API and Perplexity sonar-pro-search based on query type. Added compatibility notes, license information, and improved descriptions for clarity. Removed outdated example scripts to streamline the codebase.
This commit is contained in:
Vinayak Agarwal
2026-03-01 07:36:19 -08:00
parent 29c869326e
commit f72b7f4521
13 changed files with 3969 additions and 769 deletions

View File

@@ -1,174 +0,0 @@
#!/usr/bin/env python3
"""
Example usage of the Research Lookup skill with automatic model selection.
This script demonstrates:
1. Automatic model selection based on query complexity
2. Manual model override options
3. Batch query processing
4. Integration with scientific writing workflows
"""
import os
from research_lookup import ResearchLookup
def example_automatic_selection():
    """Demonstrate automatic model selection."""
    banner = "=" * 80
    print(banner)
    print("EXAMPLE 1: Automatic Model Selection")
    print(banner)
    print()
    research = ResearchLookup()
    # One simple lookup and one analytical query, showing how each routes.
    demos = (
        ("Recent advances in CRISPR gene editing 2024",
         "Sonar Pro Search (fast lookup)"),
        ("Compare and contrast the efficacy of mRNA vaccines versus traditional vaccines",
         "Sonar Reasoning Pro (analytical)"),
    )
    for question, expected_label in demos:
        print(f"Query: {question}")
        print(f"Expected model: {expected_label}")
        outcome = research.lookup(question)
        print(f"Actual model: {outcome.get('model')}")
        print()
def example_manual_override():
    """Demonstrate manual model override."""
    divider = "=" * 80
    print(divider)
    print("EXAMPLE 2: Manual Model Override")
    print(divider)
    print()
    # Budget-constrained rapid lookup: pin the cheaper model.
    forced_pro = ResearchLookup(force_model='pro')
    query = "Explain the mechanism of CRISPR-Cas9"
    print(f"Query: {query}")
    print(f"Forced model: Sonar Pro Search")
    outcome = forced_pro.lookup(query)
    print(f"Model used: {outcome.get('model')}")
    print()
    # Critical analysis: pin the reasoning model for the same query.
    forced_reasoning = ResearchLookup(force_model='reasoning')
    print(f"Query: {query}")
    print(f"Forced model: Sonar Reasoning Pro")
    outcome = forced_reasoning.lookup(query)
    print(f"Model used: {outcome.get('model')}")
    print()
def example_batch_queries():
    """Demonstrate batch query processing."""
    rule = "=" * 80
    print(rule)
    print("EXAMPLE 3: Batch Query Processing")
    print(rule)
    print()
    research = ResearchLookup()
    # Mix of simple and complex queries; each routes independently.
    queries = [
        "Recent clinical trials for Alzheimer's disease",  # Sonar Pro Search
        "Compare deep learning vs traditional ML in drug discovery",  # Sonar Reasoning Pro
        "Statistical power analysis methods",  # Sonar Pro Search
    ]
    print("Processing batch queries...")
    print("Each query will automatically select the appropriate model")
    print()
    results = research.batch_lookup(queries, delay=1.0)
    for position, outcome in enumerate(results, start=1):
        print(f"Query {position}: {outcome['query'][:50]}...")
        print(f" Model: {outcome.get('model')}")
        print(f" Type: {outcome.get('model_type')}")
        print()
def example_scientific_writing_workflow():
    """Demonstrate integration with scientific writing workflow."""
    rule = "=" * 80
    print(rule)
    print("EXAMPLE 4: Scientific Writing Workflow")
    print(rule)
    print()
    research = ResearchLookup()
    # Phase 1: breadth-first literature review (these would auto-route to
    # Sonar Pro Search).
    print("PHASE 1: Literature Review (Breadth)")
    for topic in (
        "Recent papers on machine learning in genomics 2024",
        "Clinical applications of AI in radiology",
        "RNA sequencing analysis methods",
    ):
        print(f" - {topic}")
    print()
    # Phase 2: synthesis and analysis (these would auto-route to
    # Sonar Reasoning Pro).
    print("PHASE 2: Discussion (Synthesis & Analysis)")
    for topic in (
        "Compare the advantages and limitations of different ML approaches in genomics",
        "Explain the relationship between model interpretability and clinical adoption",
        "Analyze the ethical implications of AI in medical diagnosis",
    ):
        print(f" - {topic}")
    print()
def main():
    """Run all examples (requires OPENROUTER_API_KEY to be set).

    Live examples are left commented out so the script runs without spending
    API credits; the complexity-assessment demo below works entirely offline.
    """
    if not os.getenv("OPENROUTER_API_KEY"):
        print("Note: Set OPENROUTER_API_KEY environment variable to run live queries")
        print("These examples show the structure without making actual API calls")
        print()
    # Uncomment to run examples (requires API key)
    # example_automatic_selection()
    # example_manual_override()
    # example_batch_queries()
    # example_scientific_writing_workflow()

    # Show complexity assessment without API calls
    print("=" * 80)
    print("COMPLEXITY ASSESSMENT EXAMPLES (No API calls required)")
    print("=" * 80)
    print()
    # A dummy key satisfies the constructor's env-var check; no requests
    # are made because we only call the local complexity heuristic.
    os.environ.setdefault("OPENROUTER_API_KEY", "test")
    research = ResearchLookup()
    test_queries = [
        ("Recent CRISPR studies", "pro"),
        ("Compare CRISPR vs TALENs", "reasoning"),
        ("Explain how CRISPR works", "reasoning"),
        ("Western blot protocol", "pro"),
        ("Pros and cons of different sequencing methods", "reasoning"),
    ]
    for query, expected in test_queries:
        complexity = research._assess_query_complexity(query)
        model_name = "Sonar Reasoning Pro" if complexity == "reasoning" else "Sonar Pro Search"
        # BUG FIX: both branches of this ternary were the empty string (the
        # pass/fail markers were lost), so every line printed identically
        # regardless of whether the heuristic matched the expectation.
        status = "✅" if complexity == expected else "❌"
        print(f"{status} '{query}'")
        print(f"{model_name}")
        print()


if __name__ == "__main__":
    main()

View File

@@ -1,187 +0,0 @@
#!/usr/bin/env python3
"""
Research Lookup Tool for Claude Code
Performs research queries using Perplexity Sonar Pro Search via OpenRouter.
"""
import os
import sys
import json
from typing import Dict, List, Optional
# Import the main research lookup class
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'scripts'))
from research_lookup import ResearchLookup
def format_response(result: Dict) -> str:
    """Format a research lookup result as markdown for Claude Code display.

    Args:
        result: Result dict from ResearchLookup.lookup(). Must contain
            "success"; on success also "response", "citations", "query",
            "model", "timestamp", and optionally "sources" and "usage".

    Returns:
        A markdown-formatted string, or a one-line error message when
        result["success"] is falsy.
    """
    if not result["success"]:
        return f"❌ Research lookup failed: {result['error']}"
    response = result["response"]
    citations = result["citations"]
    # "sources" holds API-provided search results; may be absent.
    sources = result.get("sources", [])
    # Format the output for Claude Code
    output = f"""🔍 **Research Results**
**Query:** {result['query']}
**Model:** {result['model']}
**Timestamp:** {result['timestamp']}
**Note:** Results prioritized by citation count, venue prestige, and author reputation
---
{response}
"""
    # Display API-provided sources with venue/citation info
    if sources:
        output += f"\n📚 **Sources ({len(sources)}):**\n"
        output += "_Prioritized by venue quality and citation impact_\n\n"
        for i, source in enumerate(sources, 1):
            title = source.get("title", "Untitled")
            url = source.get("url", "")
            date = source.get("date", "")
            snippet = source.get("snippet", "")
            # Format source entry with available metadata
            date_str = f" ({date})" if date else ""
            output += f"{i}. **{title}**{date_str}\n"
            # Add venue indicator if detectable from URL
            venue_indicator = _detect_venue_tier(url)
            if venue_indicator:
                output += f" 📊 Venue: {venue_indicator}\n"
            if url:
                output += f" 🔗 {url}\n"
            if snippet:
                # Trim long snippets to 150 chars with a trailing ellipsis.
                output += f" _{snippet[:150]}{'...' if len(snippet) > 150 else ''}_\n"
            output += "\n"
    # Display extracted citations (DOIs, etc.)
    if citations:
        doi_citations = [c for c in citations if c.get("type") == "doi"]
        url_citations = [c for c in citations if c.get("type") == "url"]
        if doi_citations:
            output += f"\n🔗 **DOI References ({len(doi_citations)}):**\n"
            for i, citation in enumerate(doi_citations, 1):
                output += f"{i}. DOI: {citation.get('doi', '')}{citation.get('url', '')}\n"
        if url_citations:
            output += f"\n🌐 **Additional URLs ({len(url_citations)}):**\n"
            for i, citation in enumerate(url_citations, 1):
                url = citation.get('url', '')
                venue = _detect_venue_tier(url)
                venue_str = f" [{venue}]" if venue else ""
                output += f"{i}. {url}{venue_str}\n"
    # Token usage footer, when the API reported it.
    if result.get("usage"):
        usage = result["usage"]
        output += f"\n**Usage:** {usage.get('total_tokens', 'N/A')} tokens"
    return output
def _detect_venue_tier(url: str) -> Optional[str]:
    """Detect venue tier from URL to indicate source quality.

    Scans the lowercased URL for known publisher domains, checking premier
    venues first so a higher-tier label always wins. Returns the tier label
    for the first matching domain, or None when no domain is recognized.
    """
    if not url:
        return None
    haystack = url.lower()
    tier_tables = (
        # Tier 1 - Premier venues
        {
            "nature.com": "Nature (Tier 1)",
            "science.org": "Science (Tier 1)",
            "cell.com": "Cell Press (Tier 1)",
            "nejm.org": "NEJM (Tier 1)",
            "thelancet.com": "Lancet (Tier 1)",
            "jamanetwork.com": "JAMA (Tier 1)",
            "pnas.org": "PNAS (Tier 1)",
        },
        # Tier 2 - High-impact specialized
        {
            "neurips.cc": "NeurIPS (Tier 2 - Top ML)",
            "icml.cc": "ICML (Tier 2 - Top ML)",
            "openreview.net": "Top ML Conference (Tier 2)",
            "aacrjournals.org": "AACR Journals (Tier 2)",
            "ahajournals.org": "AHA Journals (Tier 2)",
            "bloodjournal.org": "Blood (Tier 2)",
            "jci.org": "JCI (Tier 2)",
        },
        # Tier 3 - Respected academic sources
        {
            "springer.com": "Springer",
            "wiley.com": "Wiley",
            "elsevier.com": "Elsevier",
            "oup.com": "Oxford University Press",
            "arxiv.org": "arXiv (Preprint)",
            "biorxiv.org": "bioRxiv (Preprint)",
            "medrxiv.org": "medRxiv (Preprint)",
            "pubmed": "PubMed",
            "ncbi.nlm.nih.gov": "NCBI/PubMed",
            "ieee.org": "IEEE",
            "acm.org": "ACM",
        },
    )
    for table in tier_tables:
        for domain, label in table.items():
            if domain in haystack:
                return label
    return None
def main():
    """Main entry point for Claude Code tool.

    Reads the query from argv, runs the lookup, prints the formatted
    result, and returns a process exit code (0 on success, 1 on failure).
    """
    # Refuse to run without credentials.
    if not os.getenv("OPENROUTER_API_KEY"):
        print("❌ Error: OPENROUTER_API_KEY environment variable not set")
        print("Please set it in your .env file or export it:")
        print(" export OPENROUTER_API_KEY='your_openrouter_api_key'")
        return 1
    # Everything after the script name is the query.
    if len(sys.argv) < 2:
        print("❌ Error: No query provided")
        print("Usage: python lookup.py 'your research query here'")
        return 1
    query = " ".join(sys.argv[1:])
    try:
        research = ResearchLookup()
        print(f"🔍 Researching: {query}")
        result = research.lookup(query)
        print(format_response(result))
        return 0 if result["success"] else 1
    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return 1


if __name__ == "__main__":
    exit(main())

View File

@@ -1,208 +1,269 @@
#!/usr/bin/env python3
"""
Research Information Lookup Tool
Uses Perplexity's Sonar Pro Search model through OpenRouter for academic research queries.
Routes research queries to the best backend:
- Parallel Chat API (core model): Default for all general research queries
- Perplexity sonar-pro-search (via OpenRouter): Academic-specific paper searches
Environment variables:
PARALLEL_API_KEY - Required for Parallel Chat API (primary backend)
OPENROUTER_API_KEY - Required for Perplexity academic searches (fallback)
"""
import os
import sys
import json
import requests
import re
import time
import requests
from datetime import datetime
from typing import Dict, List, Optional, Any
from urllib.parse import quote
from typing import Any, Dict, List, Optional
class ResearchLookup:
"""Research information lookup using Perplexity Sonar models via OpenRouter."""
"""Research information lookup with intelligent backend routing.
# Available models
MODELS = {
"pro": "perplexity/sonar-pro", # Fast lookup, cost-effective
"reasoning": "perplexity/sonar-reasoning-pro", # Deep analysis with reasoning
}
Routes queries to the Parallel Chat API (default) or Perplexity
sonar-pro-search (academic paper searches only).
"""
# Keywords that indicate complex queries requiring reasoning model
REASONING_KEYWORDS = [
"compare", "contrast", "analyze", "analysis", "evaluate", "critique",
"versus", "vs", "vs.", "compared to", "differences between", "similarities",
"meta-analysis", "systematic review", "synthesis", "integrate",
"mechanism", "why", "how does", "how do", "explain", "relationship",
"theoretical framework", "implications", "interpret", "reasoning",
"controversy", "conflicting", "paradox", "debate", "reconcile",
"pros and cons", "advantages and disadvantages", "trade-off", "tradeoff",
ACADEMIC_KEYWORDS = [
"find papers", "find paper", "find articles", "find article",
"cite ", "citation", "citations for",
"doi ", "doi:", "pubmed", "pmid",
"journal article", "peer-reviewed",
"systematic review", "meta-analysis",
"literature search", "literature on",
"academic papers", "academic paper",
"research papers on", "research paper on",
"published studies", "published study",
"scholarly", "scholar",
"arxiv", "preprint",
"foundational papers", "seminal papers", "landmark papers",
"highly cited", "most cited",
]
def __init__(self, force_model: Optional[str] = None):
"""
Initialize the research lookup tool.
Args:
force_model: Optional model override ('pro' or 'reasoning').
If None, model is auto-selected based on query complexity.
"""
self.api_key = os.getenv("OPENROUTER_API_KEY")
if not self.api_key:
raise ValueError("OPENROUTER_API_KEY environment variable not set")
PARALLEL_SYSTEM_PROMPT = (
"You are a deep research analyst. Provide a comprehensive, well-cited "
"research report on the user's topic. Include:\n"
"- Key findings with specific data, statistics, and quantitative evidence\n"
"- Detailed analysis organized by themes\n"
"- Multiple authoritative sources cited inline\n"
"- Methodologies and implications where relevant\n"
"- Future outlook and research gaps\n"
"Use markdown formatting with clear section headers. "
"Prioritize authoritative and recent sources."
)
self.base_url = "https://openrouter.ai/api/v1"
self.force_model = force_model
self.headers = {
"Authorization": f"Bearer {self.api_key}",
CHAT_BASE_URL = "https://api.parallel.ai"
def __init__(self, force_backend: Optional[str] = None):
    """Initialize the research lookup tool.

    Args:
        force_backend: Force a specific backend ('parallel' or 'perplexity').
            If None, backend is auto-selected based on query content.

    Raises:
        ValueError: When neither backend has an API key configured.
    """
    self.force_backend = force_backend
    # Backend availability is determined solely by which keys are set.
    self.parallel_available = bool(os.getenv("PARALLEL_API_KEY"))
    self.perplexity_available = bool(os.getenv("OPENROUTER_API_KEY"))
    if not (self.parallel_available or self.perplexity_available):
        raise ValueError(
            "No API keys found. Set at least one of:\n"
            " PARALLEL_API_KEY (for Parallel Chat API - primary)\n"
            " OPENROUTER_API_KEY (for Perplexity academic search - fallback)"
        )
def _select_backend(self, query: str) -> str:
    """Select the best backend for a query.

    Precedence: an explicit override (honored only when its key is set),
    then Perplexity for academic-flavored queries, then whichever backend
    has credentials. Raises ValueError when nothing is available.
    """
    # Explicit override wins, but only when its API key is configured.
    if self.force_backend:
        if self.force_backend == "perplexity" and self.perplexity_available:
            return "perplexity"
        if self.force_backend == "parallel" and self.parallel_available:
            return "parallel"
    # Academic-style queries (paper/DOI/citation searches) prefer Perplexity.
    lowered = query.lower()
    if any(marker in lowered for marker in self.ACADEMIC_KEYWORDS):
        if self.perplexity_available:
            return "perplexity"
    # Otherwise fall back to whichever backend is configured.
    if self.parallel_available:
        return "parallel"
    if self.perplexity_available:
        return "perplexity"
    raise ValueError("No backend available. Check API keys.")
# ------------------------------------------------------------------
# Parallel Chat API backend
# ------------------------------------------------------------------
def _get_chat_client(self):
    """Return a cached OpenAI client configured for the Parallel Chat API.

    The client is constructed on first use so the 'openai' dependency is
    only required when the Parallel backend is actually exercised.
    """
    cached = getattr(self, "_chat_client", None)
    if cached is not None:
        return cached
    try:
        from openai import OpenAI
    except ImportError:
        raise ImportError(
            "The 'openai' package is required for Parallel Chat API.\n"
            "Install it with: pip install openai"
        )
    self._chat_client = OpenAI(
        api_key=os.getenv("PARALLEL_API_KEY"),
        base_url=self.CHAT_BASE_URL,
    )
    return self._chat_client
def _parallel_lookup(self, query: str) -> Dict[str, Any]:
    """Run research via the Parallel Chat API (core model).

    Args:
        query: The research question to submit.

    Returns:
        A result dict with "success", "query", "timestamp", "backend", and
        "model" always present; on success also "response", "citations",
        and "sources", on failure an "error" message instead.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    model = "core"
    try:
        client = self._get_chat_client()
        # Progress note goes to stderr so stdout stays clean for results.
        print(f"[Research] Parallel Chat API (model={model})...", file=sys.stderr)
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": self.PARALLEL_SYSTEM_PROMPT},
                {"role": "user", "content": query},
            ],
            stream=False,
        )
        # Defensive extraction: tolerate an empty choices list or a None
        # message content rather than raising.
        content = ""
        if response.choices and len(response.choices) > 0:
            content = response.choices[0].message.content or ""
        # API-provided citations (from the response basis) are treated as
        # authoritative "sources"; text-scraped ones are appended as fallback.
        api_citations = self._extract_basis_citations(response)
        text_citations = self._extract_citations_from_text(content)
        return {
            "success": True,
            "query": query,
            "response": content,
            "citations": api_citations + text_citations,
            "sources": api_citations,
            "timestamp": timestamp,
            "backend": "parallel",
            "model": f"parallel-chat/{model}",
        }
    except Exception as e:
        # Broad catch: any SDK/network failure becomes a structured error
        # result rather than an exception propagating to the caller.
        return {
            "success": False,
            "query": query,
            "error": str(e),
            "timestamp": timestamp,
            "backend": "parallel",
            "model": f"parallel-chat/{model}",
        }
def _extract_basis_citations(self, response) -> List[Dict[str, str]]:
    """Extract citation sources from the Chat API research basis.

    Basis entries and their citations may be plain dicts or SDK objects;
    both access styles are supported. Duplicate URLs are skipped.
    """
    collected: List[Dict[str, str]] = []
    basis = getattr(response, "basis", None)
    if not basis or not isinstance(basis, list):
        return collected
    seen: set = set()
    for entry in basis:
        if isinstance(entry, dict):
            entry_citations = entry.get("citations", [])
        else:
            entry_citations = getattr(entry, "citations", None) or []
        for cit in entry_citations:
            if isinstance(cit, dict):
                url = cit.get("url", "")
                title = cit.get("title", "")
                excerpts = cit.get("excerpts", [])
            else:
                url = getattr(cit, "url", "")
                title = getattr(cit, "title", "")
                excerpts = getattr(cit, "excerpts", [])
            if not url or url in seen:
                continue
            seen.add(url)
            collected.append({
                "type": "source",
                "url": url,
                "title": title,
                "excerpts": excerpts,
            })
    return collected
# ------------------------------------------------------------------
# Perplexity academic search backend
# ------------------------------------------------------------------
def _perplexity_lookup(self, query: str) -> Dict[str, Any]:
"""Run academic search via Perplexity sonar-pro-search through OpenRouter."""
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
api_key = os.getenv("OPENROUTER_API_KEY")
model = "perplexity/sonar-pro-search"
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
"HTTP-Referer": "https://scientific-writer.local",
"X-Title": "Scientific Writer Research Tool"
"X-Title": "Scientific Writer Research Tool",
}
def _select_model(self, query: str) -> str:
"""
Select the appropriate model based on query complexity.
Args:
query: The research query
Returns:
Model identifier string
"""
if self.force_model:
return self.MODELS.get(self.force_model, self.MODELS["reasoning"])
# Check for reasoning keywords (case-insensitive)
query_lower = query.lower()
for keyword in self.REASONING_KEYWORDS:
if keyword in query_lower:
return self.MODELS["reasoning"]
# Check for multiple questions or complex structure
question_count = query.count("?")
if question_count >= 2:
return self.MODELS["reasoning"]
# Check for very long queries (likely complex)
if len(query) > 200:
return self.MODELS["reasoning"]
# Default to pro for simple lookups
return self.MODELS["pro"]
research_prompt = self._format_academic_prompt(query)
messages = [
{
"role": "system",
"content": (
"You are an academic research assistant specializing in finding "
"HIGH-IMPACT, INFLUENTIAL research.\n\n"
"QUALITY PRIORITIZATION (CRITICAL):\n"
"- ALWAYS prefer highly-cited papers over obscure publications\n"
"- ALWAYS prioritize Tier-1 venues: Nature, Science, Cell, NEJM, Lancet, JAMA, PNAS\n"
"- ALWAYS prefer papers from established researchers\n"
"- Include citation counts when known (e.g., 'cited 500+ times')\n"
"- Quality matters more than quantity\n\n"
"VENUE HIERARCHY:\n"
"1. Nature/Science/Cell family, NEJM, Lancet, JAMA (highest)\n"
"2. High-impact specialized journals (IF>10), top conferences (NeurIPS, ICML, ICLR)\n"
"3. Respected field-specific journals (IF 5-10)\n"
"4. Other peer-reviewed sources (only if no better option)\n\n"
"Focus exclusively on scholarly sources. Prioritize recent literature (2020-2026) "
"and provide complete citations with DOIs."
),
},
{"role": "user", "content": research_prompt},
]
def _make_request(self, messages: List[Dict[str, str]], model: str, **kwargs) -> Dict[str, Any]:
"""Make a request to the OpenRouter API with academic search mode."""
data = {
"model": model,
"messages": messages,
"max_tokens": 8000,
"temperature": 0.1, # Low temperature for factual research
# Perplexity-specific parameters for academic search
"search_mode": "academic", # Prioritize scholarly sources (peer-reviewed papers, journals)
"search_context_size": "high", # Always use high context for deeper research
**kwargs
"temperature": 0.1,
"search_mode": "academic",
"search_context_size": "high",
}
try:
response = requests.post(
f"{self.base_url}/chat/completions",
headers=self.headers,
"https://openrouter.ai/api/v1/chat/completions",
headers=headers,
json=data,
timeout=90 # Increased timeout for academic search
timeout=90,
)
response.raise_for_status()
return response.json()
except requests.exceptions.RequestException as e:
raise Exception(f"API request failed: {str(e)}")
resp_json = response.json()
def _format_research_prompt(self, query: str) -> str:
"""Format the query for optimal research results."""
return f"""You are an expert research assistant. Please provide comprehensive, accurate research information for the following query: "{query}"
IMPORTANT INSTRUCTIONS:
1. Focus on ACADEMIC and SCIENTIFIC sources (peer-reviewed papers, reputable journals, institutional research)
2. Include RECENT information (prioritize 2020-2026 publications)
3. Provide COMPLETE citations with authors, title, journal/conference, year, and DOI when available
4. Structure your response with clear sections and proper attribution
5. Be comprehensive but concise - aim for 800-1200 words
6. Include key findings, methodologies, and implications when relevant
7. Note any controversies, limitations, or conflicting evidence
PAPER QUALITY AND POPULARITY PRIORITIZATION (CRITICAL):
8. ALWAYS prioritize HIGHLY-CITED papers over obscure publications:
- Recent papers (0-3 years): prefer 20+ citations, highlight 100+ as highly influential
- Mid-age papers (3-7 years): prefer 100+ citations, highlight 500+ as landmark
- Older papers (7+ years): prefer 500+ citations, highlight 1000+ as foundational
9. ALWAYS prioritize papers from TOP-TIER VENUES:
- Tier 1 (highest priority): Nature, Science, Cell, NEJM, Lancet, JAMA, PNAS, Nature Medicine, Nature Biotechnology
- Tier 2 (high priority): High-impact specialized journals (IF>10), top conferences (NeurIPS, ICML, ICLR for AI/ML)
- Tier 3: Respected specialized journals (IF 5-10)
- Only cite lower-tier venues if directly relevant AND no better source exists
10. PREFER papers from ESTABLISHED, REPUTABLE AUTHORS:
- Senior researchers with high h-index and multiple high-impact publications
- Leading research groups at recognized institutions
- Authors with recognized expertise (awards, editorial positions)
11. For EACH citation, include when available:
- Approximate citation count (e.g., "cited 500+ times")
- Journal/venue tier indicator
- Notable author credentials if relevant
12. PRIORITIZE papers that DIRECTLY address the research question over tangentially related work
RESPONSE FORMAT:
- Start with a brief summary (2-3 sentences)
- Present key findings and studies in organized sections
- Rank papers by impact: most influential/cited first
- End with future directions or research gaps if applicable
- Include 5-8 high-quality citations, emphasizing Tier-1 venues and highly-cited papers
Remember: Quality over quantity. Prioritize influential, highly-cited papers from prestigious venues and established researchers."""
def lookup(self, query: str) -> Dict[str, Any]:
"""Perform a research lookup for the given query."""
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Select model based on query complexity
model = self._select_model(query)
# Format the research prompt
research_prompt = self._format_research_prompt(query)
# Prepare messages for the API with system message for academic mode
messages = [
{
"role": "system",
"content": """You are an academic research assistant specializing in finding HIGH-IMPACT, INFLUENTIAL research.
QUALITY PRIORITIZATION (CRITICAL):
- ALWAYS prefer highly-cited papers over obscure publications
- ALWAYS prioritize Tier-1 venues: Nature, Science, Cell, NEJM, Lancet, JAMA, PNAS, and their family journals
- ALWAYS prefer papers from established researchers with strong publication records
- Include citation counts when known (e.g., "cited 500+ times")
- Quality matters more than quantity - 5 excellent papers beats 10 mediocre ones
VENUE HIERARCHY:
1. Nature/Science/Cell family, NEJM, Lancet, JAMA (highest priority)
2. High-impact specialized journals (IF>10), top ML conferences (NeurIPS, ICML, ICLR)
3. Respected field-specific journals (IF 5-10)
4. Other peer-reviewed sources (only if no better option exists)
Focus exclusively on scholarly sources: peer-reviewed journals, academic papers, research institutions. Prioritize recent academic literature (2020-2026) and provide complete citations with DOIs. Always indicate paper impact through citation counts and venue prestige."""
},
{"role": "user", "content": research_prompt}
]
try:
# Make the API request
response = self._make_request(messages, model)
# Extract the response content
if "choices" in response and len(response["choices"]) > 0:
choice = response["choices"][0]
if "choices" in resp_json and len(resp_json["choices"]) > 0:
choice = resp_json["choices"][0]
if "message" in choice and "content" in choice["message"]:
content = choice["message"]["content"]
# Extract citations from API response (Perplexity provides these)
api_citations = self._extract_api_citations(response, choice)
# Also extract citations from text as fallback
api_citations = self._extract_api_citations(resp_json, choice)
text_citations = self._extract_citations_from_text(content)
# Combine: prioritize API citations, add text citations if no duplicates
citations = api_citations + text_citations
return {
@@ -210,10 +271,11 @@ Focus exclusively on scholarly sources: peer-reviewed journals, academic papers,
"query": query,
"response": content,
"citations": citations,
"sources": api_citations, # Separate field for API-provided sources
"sources": api_citations,
"timestamp": timestamp,
"backend": "perplexity",
"model": model,
"usage": response.get("usage", {})
"usage": resp_json.get("usage", {}),
}
else:
raise Exception("Invalid response format from API")
@@ -226,22 +288,54 @@ Focus exclusively on scholarly sources: peer-reviewed journals, academic papers,
"query": query,
"error": str(e),
"timestamp": timestamp,
"model": model
"backend": "perplexity",
"model": model,
}
# ------------------------------------------------------------------
# Shared utilities
# ------------------------------------------------------------------
def _format_academic_prompt(self, query: str) -> str:
    """Format a query for academic research results via Perplexity.

    Wraps the raw query in a prompt that instructs the model to favor
    peer-reviewed, highly-cited, top-tier-venue sources and to return a
    structured, citation-rich answer.

    Args:
        query: The user's research question, embedded verbatim.

    Returns:
        The full prompt string to send as the user message.
    """
    return f"""You are an expert research assistant. Please provide comprehensive, accurate research information for the following query: "{query}"
IMPORTANT INSTRUCTIONS:
1. Focus on ACADEMIC and SCIENTIFIC sources (peer-reviewed papers, reputable journals, institutional research)
2. Include RECENT information (prioritize 2020-2026 publications)
3. Provide COMPLETE citations with authors, title, journal/conference, year, and DOI when available
4. Structure your response with clear sections and proper attribution
5. Be comprehensive but concise - aim for 800-1200 words
6. Include key findings, methodologies, and implications when relevant
7. Note any controversies, limitations, or conflicting evidence
PAPER QUALITY PRIORITIZATION (CRITICAL):
8. ALWAYS prioritize HIGHLY-CITED papers over obscure publications
9. ALWAYS prioritize papers from TOP-TIER VENUES (Nature, Science, Cell, NEJM, Lancet, JAMA, PNAS)
10. PREFER papers from ESTABLISHED, REPUTABLE AUTHORS
11. For EACH citation include when available: citation count, venue tier, author credentials
12. PRIORITIZE papers that DIRECTLY address the research question
RESPONSE FORMAT:
- Start with a brief summary (2-3 sentences)
- Present key findings and studies in organized sections
- Rank papers by impact: most influential/cited first
- End with future directions or research gaps if applicable
- Include 5-8 high-quality citations
Remember: Quality over quantity. Prioritize influential, highly-cited papers from prestigious venues."""
def _extract_api_citations(self, response: Dict[str, Any], choice: Dict[str, Any]) -> List[Dict[str, str]]:
"""Extract citations from Perplexity API response fields."""
citations = []
# Perplexity returns citations in search_results field (new format)
# Check multiple possible locations where OpenRouter might place them
search_results = (
response.get("search_results") or
choice.get("search_results") or
choice.get("message", {}).get("search_results") or
[]
response.get("search_results")
or choice.get("search_results")
or choice.get("message", {}).get("search_results")
or []
)
for result in search_results:
citation = {
"type": "source",
@@ -249,162 +343,164 @@ Focus exclusively on scholarly sources: peer-reviewed journals, academic papers,
"url": result.get("url", ""),
"date": result.get("date", ""),
}
# Add snippet if available (newer API feature)
if result.get("snippet"):
citation["snippet"] = result.get("snippet")
citation["snippet"] = result["snippet"]
citations.append(citation)
# Also check for legacy citations field (backward compatibility)
legacy_citations = (
response.get("citations") or
choice.get("citations") or
choice.get("message", {}).get("citations") or
[]
response.get("citations")
or choice.get("citations")
or choice.get("message", {}).get("citations")
or []
)
for url in legacy_citations:
if isinstance(url, str):
# Legacy format was just URLs
citations.append({
"type": "source",
"url": url,
"title": "",
"date": ""
})
citations.append({"type": "source", "url": url, "title": "", "date": ""})
elif isinstance(url, dict):
citations.append({
"type": "source",
"url": url.get("url", ""),
"title": url.get("title", ""),
"date": url.get("date", "")
"date": url.get("date", ""),
})
return citations
def _extract_citations_from_text(self, text: str) -> List[Dict[str, str]]:
"""Extract potential citations from the response text as fallback."""
import re
"""Extract DOIs and academic URLs from response text as fallback."""
citations = []
# Look for DOI patterns first (most reliable)
# Matches: doi:10.xxx, DOI: 10.xxx, https://doi.org/10.xxx
doi_pattern = r'(?:doi[:\s]*|https?://(?:dx\.)?doi\.org/)(10\.[0-9]{4,}/[^\s\)\]\,\[\<\>]+)'
doi_matches = re.findall(doi_pattern, text, re.IGNORECASE)
seen_dois = set()
for doi in doi_matches:
# Clean up DOI - remove trailing punctuation and brackets
doi_clean = doi.strip().rstrip('.,;:)]')
doi_clean = doi.strip().rstrip(".,;:)]")
if doi_clean and doi_clean not in seen_dois:
seen_dois.add(doi_clean)
citations.append({
"type": "doi",
"doi": doi_clean,
"url": f"https://doi.org/{doi_clean}"
"url": f"https://doi.org/{doi_clean}",
})
# Look for URLs that might be sources
url_pattern = r'https?://[^\s\)\]\,\<\>\"\']+(?:arxiv\.org|pubmed|ncbi\.nlm\.nih\.gov|nature\.com|science\.org|wiley\.com|springer\.com|ieee\.org|acm\.org)[^\s\)\]\,\<\>\"\']*'
url_pattern = (
r'https?://[^\s\)\]\,\<\>\"\']+(?:arxiv\.org|pubmed|ncbi\.nlm\.nih\.gov|'
r'nature\.com|science\.org|wiley\.com|springer\.com|ieee\.org|acm\.org)'
r'[^\s\)\]\,\<\>\"\']*'
)
url_matches = re.findall(url_pattern, text, re.IGNORECASE)
seen_urls = set()
for url in url_matches:
url_clean = url.rstrip('.')
url_clean = url.rstrip(".")
if url_clean not in seen_urls:
seen_urls.add(url_clean)
citations.append({
"type": "url",
"url": url_clean
})
citations.append({"type": "url", "url": url_clean})
return citations
# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------
def lookup(self, query: str) -> Dict[str, Any]:
    """Perform a research lookup, routing to the best backend.

    Parallel Chat API is used by default. Perplexity sonar-pro-search
    is used only for academic-specific queries (paper searches, DOI
    lookups), as decided by ``_select_backend``.

    Args:
        query: Research question or topic to look up.

    Returns:
        Result dict produced by the selected backend helper.
    """
    backend = self._select_backend(query)
    # Progress goes to stderr so stdout stays clean for piped results.
    print(f"[Research] Backend: {backend} | Query: {query[:80]}...", file=sys.stderr)
    if backend == "parallel":
        return self._parallel_lookup(query)
    return self._perplexity_lookup(query)
def batch_lookup(self, queries: List[str], delay: float = 1.0) -> List[Dict[str, Any]]:
    """Perform multiple research lookups with a delay between requests.

    Args:
        queries: Queries to run, in order.
        delay: Seconds to sleep before each request after the first
            (simple rate limiting; set to 0 to disable).

    Returns:
        One result dict per query, in the same order as ``queries``.
    """
    results = []
    for i, query in enumerate(queries):
        if i > 0 and delay > 0:
            time.sleep(delay)  # rate limiting between backend calls
        result = self.lookup(query)
        results.append(result)
        # Progress on stderr so stdout stays clean for piped results.
        print(f"[Research] Completed query {i+1}/{len(queries)}: {query[:50]}...", file=sys.stderr)
    return results
def get_model_info(self) -> Dict[str, Any]:
    """Get information about available models from OpenRouter.

    Returns:
        Parsed JSON payload from the ``/models`` endpoint, or a dict of
        the form ``{"error": <message>}`` if the request fails.
    """
    endpoint = f"{self.base_url}/models"
    try:
        resp = requests.get(endpoint, headers=self.headers, timeout=30)
        resp.raise_for_status()
        return resp.json()
    except Exception as exc:
        # Surface the failure to the caller instead of raising.
        return {"error": str(exc)}
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main():
    """Command-line interface for the research lookup tool.

    Returns:
        Process exit code: 0 on success, 1 on configuration or runtime error.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Research Information Lookup Tool (Parallel Chat API + Perplexity)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # General research (uses Parallel Chat API, core model)
  python research_lookup.py "latest advances in quantum computing 2025"

  # Academic paper search (auto-routes to Perplexity)
  python research_lookup.py "find papers on CRISPR gene editing clinical trials"

  # Force a specific backend
  python research_lookup.py "topic" --force-backend parallel
  python research_lookup.py "topic" --force-backend perplexity

  # Save output to file
  python research_lookup.py "topic" -o results.txt

  # JSON output
  python research_lookup.py "topic" --json -o results.json
""",
    )
    parser.add_argument("query", nargs="?", help="Research query to look up")
    parser.add_argument("--batch", nargs="+", help="Run multiple queries")
    parser.add_argument(
        "--force-backend",
        choices=["parallel", "perplexity"],
        help="Force a specific backend (default: auto-select)",
    )
    parser.add_argument("-o", "--output", help="Write output to file")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    # Output destination: a file when -o is given, otherwise stdout.
    output_file = open(args.output, "w", encoding="utf-8") if args.output else None

    def write_output(text):
        """Write one line to the output file or stdout."""
        if output_file:
            output_file.write(text + "\n")
        else:
            print(text)

    def finish(code):
        """Close the output file (if any) and return the exit code."""
        if output_file:
            output_file.close()
        return code

    # At least one backend credential is required.
    has_parallel = bool(os.getenv("PARALLEL_API_KEY"))
    has_perplexity = bool(os.getenv("OPENROUTER_API_KEY"))
    if not has_parallel and not has_perplexity:
        print("Error: No API keys found. Set at least one:", file=sys.stderr)
        print("  export PARALLEL_API_KEY='...'    (primary - Parallel Chat API)", file=sys.stderr)
        print("  export OPENROUTER_API_KEY='...'  (fallback - Perplexity academic)", file=sys.stderr)
        return finish(1)

    if not args.query and not args.batch:
        parser.print_help()
        return finish(1)

    try:
        research = ResearchLookup(force_backend=args.force_backend)

        if args.batch:
            print(f"Running batch research for {len(args.batch)} queries...", file=sys.stderr)
            results = research.batch_lookup(args.batch)
        else:
            print(f"Researching: {args.query}", file=sys.stderr)
            results = [research.lookup(args.query)]

        # JSON output short-circuits the human-readable display.
        # default=str: tolerate non-JSON-native values (e.g. datetimes).
        if args.json:
            write_output(json.dumps(results, indent=2, ensure_ascii=False, default=str))
            return finish(0)

        # Human-readable display, one section per query.
        for i, result in enumerate(results):
            if result["success"]:
                write_output(f"\n{'='*80}")
                write_output(f"Query {i+1}: {result['query']}")
                write_output(f"Timestamp: {result['timestamp']}")
                write_output(f"Backend: {result.get('backend', 'unknown')} | Model: {result.get('model', 'unknown')}")
                write_output(f"{'='*80}")
                write_output(result["response"])

                # API-provided sources first (most reliable).
                sources = result.get("sources", [])
                if sources:
                    write_output(f"\nSources ({len(sources)}):")
                    for j, source in enumerate(sources):
                        title = source.get("title", "Untitled")
                        url = source.get("url", "")
                        write_output(f"  [{j+1}] {title}")
                        if url:
                            write_output(f"      {url}")

                # Additional text-extracted citations as a fallback.
                citations = result.get("citations", [])
                text_citations = [c for c in citations if c.get("type") in ("doi", "url")]
                if text_citations:
                    write_output(f"\nAdditional References ({len(text_citations)}):")
                    for j, citation in enumerate(text_citations):
                        if citation.get("type") == "doi":
                            write_output(f"  [{j+1}] DOI: {citation.get('doi', '')} - {citation.get('url', '')}")
                        else:
                            write_output(f"  [{j+1}] {citation.get('url', '')}")
            else:
                # NOTE(review): failure branch reconstructed from diff residue;
                # result dicts are assumed to carry an 'error' key on failure.
                write_output(f"\nQuery {i+1} failed: {result.get('error', 'unknown error')}")

        return finish(0)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return finish(1)
if __name__ == "__main__":
    # sys.exit propagates main()'s return value as the process exit code.
    sys.exit(main())