claude-scientific-skills/scientific-skills/research-lookup/scripts/research_lookup.py
Vinayak Agarwal 3439a21f57 Enhance citation management and literature review guidelines
- Updated SKILL.md in citation management to include best practices for identifying seminal and high-impact papers, emphasizing citation count thresholds, venue quality tiers, and author reputation indicators.
- Expanded literature review SKILL.md to prioritize high-impact papers, detailing citation metrics, journal tiers, and author reputation assessment.
- Added comprehensive evaluation strategies for paper impact and quality in literature_search_strategies.md, including citation count significance and journal impact factor guidance.
- Improved research lookup scripts to prioritize results based on citation count, venue prestige, and author reputation, enhancing the quality of research outputs.
2026-01-05 13:01:10 -08:00

475 lines · 20 KiB · Python · Executable File

#!/usr/bin/env python3
"""
Research Information Lookup Tool
Uses Perplexity's Sonar Pro Search model through OpenRouter for academic research queries.
"""
import os
import json
import requests
import time
from datetime import datetime
from typing import Dict, List, Optional, Any
from urllib.parse import quote

class ResearchLookup:
    """Research information lookup using Perplexity Sonar models via OpenRouter."""

    # Available models
    MODELS = {
        "pro": "perplexity/sonar-pro",  # Fast lookup, cost-effective
        "reasoning": "perplexity/sonar-reasoning-pro",  # Deep analysis with reasoning
    }

    # Keywords that indicate complex queries requiring reasoning model
    REASONING_KEYWORDS = [
        "compare", "contrast", "analyze", "analysis", "evaluate", "critique",
        "versus", "vs", "vs.", "compared to", "differences between", "similarities",
        "meta-analysis", "systematic review", "synthesis", "integrate",
        "mechanism", "why", "how does", "how do", "explain", "relationship",
        "theoretical framework", "implications", "interpret", "reasoning",
        "controversy", "conflicting", "paradox", "debate", "reconcile",
        "pros and cons", "advantages and disadvantages", "trade-off", "tradeoff",
    ]

    def __init__(self, force_model: Optional[str] = None):
        """
        Initialize the research lookup tool.

        Args:
            force_model: Optional model override ('pro' or 'reasoning').
                If None, model is auto-selected based on query complexity.
        """
        self.api_key = os.getenv("OPENROUTER_API_KEY")
        if not self.api_key:
            raise ValueError("OPENROUTER_API_KEY environment variable not set")

        self.base_url = "https://openrouter.ai/api/v1"
        self.force_model = force_model
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://scientific-writer.local",
            "X-Title": "Scientific Writer Research Tool",
        }

    def _select_model(self, query: str) -> str:
        """
        Select the appropriate model based on query complexity.

        Args:
            query: The research query

        Returns:
            Model identifier string
        """
        if self.force_model:
            return self.MODELS.get(self.force_model, self.MODELS["reasoning"])

        # Check for reasoning keywords (case-insensitive)
        query_lower = query.lower()
        for keyword in self.REASONING_KEYWORDS:
            if keyword in query_lower:
                return self.MODELS["reasoning"]

        # Check for multiple questions or complex structure
        question_count = query.count("?")
        if question_count >= 2:
            return self.MODELS["reasoning"]

        # Check for very long queries (likely complex)
        if len(query) > 200:
            return self.MODELS["reasoning"]

        # Default to pro for simple lookups
        return self.MODELS["pro"]

    def _make_request(self, messages: List[Dict[str, str]], model: str, **kwargs) -> Dict[str, Any]:
        """Make a request to the OpenRouter API with academic search mode."""
        data = {
            "model": model,
            "messages": messages,
            "max_tokens": 8000,
            "temperature": 0.1,  # Low temperature for factual research
            # Perplexity-specific parameters for academic search
            "search_mode": "academic",  # Prioritize scholarly sources (peer-reviewed papers, journals)
            "search_context_size": "high",  # Always use high context for deeper research
            **kwargs
        }
        try:
            response = requests.post(
                f"{self.base_url}/chat/completions",
                headers=self.headers,
                json=data,
                timeout=90,  # Increased timeout for academic search
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            raise Exception(f"API request failed: {str(e)}")

    def _format_research_prompt(self, query: str) -> str:
        """Format the query for optimal research results."""
        return f"""You are an expert research assistant. Please provide comprehensive, accurate research information for the following query: "{query}"
IMPORTANT INSTRUCTIONS:
1. Focus on ACADEMIC and SCIENTIFIC sources (peer-reviewed papers, reputable journals, institutional research)
2. Include RECENT information (prioritize 2020-2026 publications)
3. Provide COMPLETE citations with authors, title, journal/conference, year, and DOI when available
4. Structure your response with clear sections and proper attribution
5. Be comprehensive but concise - aim for 800-1200 words
6. Include key findings, methodologies, and implications when relevant
7. Note any controversies, limitations, or conflicting evidence
PAPER QUALITY AND POPULARITY PRIORITIZATION (CRITICAL):
8. ALWAYS prioritize HIGHLY-CITED papers over obscure publications:
- Recent papers (0-3 years): prefer 20+ citations, highlight 100+ as highly influential
- Mid-age papers (3-7 years): prefer 100+ citations, highlight 500+ as landmark
- Older papers (7+ years): prefer 500+ citations, highlight 1000+ as foundational
9. ALWAYS prioritize papers from TOP-TIER VENUES:
- Tier 1 (highest priority): Nature, Science, Cell, NEJM, Lancet, JAMA, PNAS, Nature Medicine, Nature Biotechnology
- Tier 2 (high priority): High-impact specialized journals (IF>10), top conferences (NeurIPS, ICML, ICLR for AI/ML)
- Tier 3: Respected specialized journals (IF 5-10)
- Only cite lower-tier venues if directly relevant AND no better source exists
10. PREFER papers from ESTABLISHED, REPUTABLE AUTHORS:
- Senior researchers with high h-index and multiple high-impact publications
- Leading research groups at recognized institutions
- Authors with recognized expertise (awards, editorial positions)
11. For EACH citation, include when available:
- Approximate citation count (e.g., "cited 500+ times")
- Journal/venue tier indicator
- Notable author credentials if relevant
12. PRIORITIZE papers that DIRECTLY address the research question over tangentially related work
RESPONSE FORMAT:
- Start with a brief summary (2-3 sentences)
- Present key findings and studies in organized sections
- Rank papers by impact: most influential/cited first
- End with future directions or research gaps if applicable
- Include 5-8 high-quality citations, emphasizing Tier-1 venues and highly-cited papers
Remember: Quality over quantity. Prioritize influential, highly-cited papers from prestigious venues and established researchers."""

    def lookup(self, query: str) -> Dict[str, Any]:
        """Perform a research lookup for the given query."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Select model based on query complexity
        model = self._select_model(query)

        # Format the research prompt
        research_prompt = self._format_research_prompt(query)

        # Prepare messages for the API with system message for academic mode
        messages = [
            {
                "role": "system",
                "content": """You are an academic research assistant specializing in finding HIGH-IMPACT, INFLUENTIAL research.
QUALITY PRIORITIZATION (CRITICAL):
- ALWAYS prefer highly-cited papers over obscure publications
- ALWAYS prioritize Tier-1 venues: Nature, Science, Cell, NEJM, Lancet, JAMA, PNAS, and their family journals
- ALWAYS prefer papers from established researchers with strong publication records
- Include citation counts when known (e.g., "cited 500+ times")
- Quality matters more than quantity - 5 excellent papers beats 10 mediocre ones
VENUE HIERARCHY:
1. Nature/Science/Cell family, NEJM, Lancet, JAMA (highest priority)
2. High-impact specialized journals (IF>10), top ML conferences (NeurIPS, ICML, ICLR)
3. Respected field-specific journals (IF 5-10)
4. Other peer-reviewed sources (only if no better option exists)
Focus exclusively on scholarly sources: peer-reviewed journals, academic papers, research institutions. Prioritize recent academic literature (2020-2026) and provide complete citations with DOIs. Always indicate paper impact through citation counts and venue prestige."""
            },
            {"role": "user", "content": research_prompt},
        ]

        try:
            # Make the API request
            response = self._make_request(messages, model)

            # Extract the response content
            if "choices" in response and len(response["choices"]) > 0:
                choice = response["choices"][0]
                if "message" in choice and "content" in choice["message"]:
                    content = choice["message"]["content"]

                    # Extract citations from API response (Perplexity provides these)
                    api_citations = self._extract_api_citations(response, choice)

                    # Also extract citations from text as fallback
                    text_citations = self._extract_citations_from_text(content)

                    # Combine the two lists, API-provided citations first
                    citations = api_citations + text_citations

                    return {
                        "success": True,
                        "query": query,
                        "response": content,
                        "citations": citations,
                        "sources": api_citations,  # Separate field for API-provided sources
                        "timestamp": timestamp,
                        "model": model,
                        "usage": response.get("usage", {}),
                    }
                else:
                    raise Exception("Invalid response format from API")
            else:
                raise Exception("No response choices received from API")
        except Exception as e:
            return {
                "success": False,
                "query": query,
                "error": str(e),
                "timestamp": timestamp,
                "model": model,
            }

    def _extract_api_citations(self, response: Dict[str, Any], choice: Dict[str, Any]) -> List[Dict[str, str]]:
        """Extract citations from Perplexity API response fields."""
        citations = []

        # Perplexity returns citations in search_results field (new format).
        # Check multiple possible locations where OpenRouter might place them.
        search_results = (
            response.get("search_results") or
            choice.get("search_results") or
            choice.get("message", {}).get("search_results") or
            []
        )
        for result in search_results:
            citation = {
                "type": "source",
                "title": result.get("title", ""),
                "url": result.get("url", ""),
                "date": result.get("date", ""),
            }
            # Add snippet if available (newer API feature)
            if result.get("snippet"):
                citation["snippet"] = result.get("snippet")
            citations.append(citation)

        # Also check for legacy citations field (backward compatibility)
        legacy_citations = (
            response.get("citations") or
            choice.get("citations") or
            choice.get("message", {}).get("citations") or
            []
        )
        for url in legacy_citations:
            if isinstance(url, str):
                # Legacy format was just URLs
                citations.append({
                    "type": "source",
                    "url": url,
                    "title": "",
                    "date": "",
                })
            elif isinstance(url, dict):
                citations.append({
                    "type": "source",
                    "url": url.get("url", ""),
                    "title": url.get("title", ""),
                    "date": url.get("date", ""),
                })

        return citations
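
    # Shape of an API-provided source entry (field values here are placeholders):
    #   {"type": "source", "title": "Example paper title",
    #    "url": "https://example.org/paper", "date": "2024-01-15", "snippet": "..."}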

    def _extract_citations_from_text(self, text: str) -> List[Dict[str, str]]:
        """Extract potential citations from the response text as fallback."""
        import re

        citations = []

        # Look for DOI patterns first (most reliable).
        # Matches: doi:10.xxx, DOI: 10.xxx, https://doi.org/10.xxx
        doi_pattern = r'(?:doi[:\s]*|https?://(?:dx\.)?doi\.org/)(10\.[0-9]{4,}/[^\s\)\]\,\[\<\>]+)'
        doi_matches = re.findall(doi_pattern, text, re.IGNORECASE)
        seen_dois = set()
        for doi in doi_matches:
            # Clean up DOI - remove trailing punctuation and brackets
            doi_clean = doi.strip().rstrip('.,;:)]')
            if doi_clean and doi_clean not in seen_dois:
                seen_dois.add(doi_clean)
                citations.append({
                    "type": "doi",
                    "doi": doi_clean,
                    "url": f"https://doi.org/{doi_clean}",
                })

        # Look for URLs that might be sources
        # (the "*" after the scheme also allows bare domains such as https://arxiv.org/...)
        url_pattern = r'https?://[^\s\)\]\,\<\>\"\']*(?:arxiv\.org|pubmed|ncbi\.nlm\.nih\.gov|nature\.com|science\.org|wiley\.com|springer\.com|ieee\.org|acm\.org)[^\s\)\]\,\<\>\"\']*'
        url_matches = re.findall(url_pattern, text, re.IGNORECASE)
        seen_urls = set()
        for url in url_matches:
            url_clean = url.rstrip('.')
            if url_clean not in seen_urls:
                seen_urls.add(url_clean)
                citations.append({
                    "type": "url",
                    "url": url_clean,
                })

        return citations
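
    # Shape of text-extracted entries (placeholder values):
    #   {"type": "doi", "doi": "10.1234/example", "url": "https://doi.org/10.1234/example"}
    #   {"type": "url", "url": "https://arxiv.org/abs/0000.00000"}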

    def batch_lookup(self, queries: List[str], delay: float = 1.0) -> List[Dict[str, Any]]:
        """Perform multiple research lookups with optional delay between requests."""
        results = []
        for i, query in enumerate(queries):
            if i > 0 and delay > 0:
                time.sleep(delay)  # Rate limiting
            result = self.lookup(query)
            results.append(result)
            # Print progress
            print(f"[Research] Completed query {i+1}/{len(queries)}: {query[:50]}...")
        return results
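
    # e.g. (hypothetical): ResearchLookup().batch_lookup(["query A", "query B"], delay=2.0)
    # would run the two lookups sequentially with a 2-second pause between requests.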

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about available models from OpenRouter."""
        try:
            response = requests.get(
                f"{self.base_url}/models",
                headers=self.headers,
                timeout=30,
            )
            response.raise_for_status()
            return response.json()
        except Exception as e:
            return {"error": str(e)}

def main():
    """Command-line interface for testing the research lookup tool."""
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Research Information Lookup Tool")
    parser.add_argument("query", nargs="?", help="Research query to look up")
    parser.add_argument("--model-info", action="store_true", help="Show available models")
    parser.add_argument("--batch", nargs="+", help="Run multiple queries")
    parser.add_argument("--force-model", choices=["pro", "reasoning"],
                        help="Force specific model: 'pro' for fast lookup, 'reasoning' for deep analysis")
    parser.add_argument("-o", "--output", help="Write output to file instead of stdout")
    parser.add_argument("--json", action="store_true", help="Output results as JSON")
    args = parser.parse_args()

    # Set up output destination
    output_file = None
    if args.output:
        output_file = open(args.output, 'w', encoding='utf-8')

    def write_output(text):
        """Write to file or stdout."""
        if output_file:
            output_file.write(text + '\n')
        else:
            print(text)

    # Check for API key
    if not os.getenv("OPENROUTER_API_KEY"):
        print("Error: OPENROUTER_API_KEY environment variable not set", file=sys.stderr)
        print("Please set it in your .env file or export it:", file=sys.stderr)
        print(" export OPENROUTER_API_KEY='your_openrouter_api_key'", file=sys.stderr)
        if output_file:
            output_file.close()
        return 1

    try:
        research = ResearchLookup(force_model=args.force_model)

        if args.model_info:
            write_output("Available models from OpenRouter:")
            models = research.get_model_info()
            if "data" in models:
                for model in models["data"]:
                    if "perplexity" in model["id"].lower():
                        write_output(f" - {model['id']}: {model.get('name', 'N/A')}")
            if output_file:
                output_file.close()
            return 0

        if not args.query and not args.batch:
            print("Error: No query provided. Use --model-info to see available models.", file=sys.stderr)
            if output_file:
                output_file.close()
            return 1

        if args.batch:
            print(f"Running batch research for {len(args.batch)} queries...", file=sys.stderr)
            results = research.batch_lookup(args.batch)
        else:
            print(f"Researching: {args.query}", file=sys.stderr)
            results = [research.lookup(args.query)]

        # Output as JSON if requested
        if args.json:
            write_output(json.dumps(results, indent=2, ensure_ascii=False))
            if output_file:
                output_file.close()
            return 0

        # Display results in human-readable format
        for i, result in enumerate(results):
            if result["success"]:
                write_output(f"\n{'='*80}")
                write_output(f"Query {i+1}: {result['query']}")
                write_output(f"Timestamp: {result['timestamp']}")
                write_output(f"Model: {result['model']}")
                write_output(f"{'='*80}")
                write_output(result["response"])

                # Display API-provided sources first (most reliable)
                sources = result.get("sources", [])
                if sources:
                    write_output(f"\n📚 Sources ({len(sources)}):")
                    for j, source in enumerate(sources):
                        title = source.get("title", "Untitled")
                        url = source.get("url", "")
                        date = source.get("date", "")
                        date_str = f" ({date})" if date else ""
                        write_output(f" [{j+1}] {title}{date_str}")
                        if url:
                            write_output(f" {url}")

                # Display additional text-extracted citations
                citations = result.get("citations", [])
                text_citations = [c for c in citations if c.get("type") in ("doi", "url")]
                if text_citations:
                    write_output(f"\n🔗 Additional References ({len(text_citations)}):")
                    for j, citation in enumerate(text_citations):
                        if citation.get("type") == "doi":
                            write_output(f" [{j+1}] DOI: {citation.get('doi', '')} - {citation.get('url', '')}")
                        elif citation.get("type") == "url":
                            write_output(f" [{j+1}] {citation.get('url', '')}")

                if result.get("usage"):
                    write_output(f"\nUsage: {result['usage']}")
            else:
                write_output(f"\nError in query {i+1}: {result['error']}")

        if output_file:
            output_file.close()
        return 0

    except Exception as e:
        print(f"Error: {str(e)}", file=sys.stderr)
        if output_file:
            output_file.close()
        return 1

if __name__ == "__main__":
    exit(main())