Mirror of https://github.com/K-Dense-AI/claude-scientific-skills.git
(synced 2026-03-27 07:09:27 +08:00)
Added parallel-web skill
Refactor research lookup skill to enhance backend routing and update documentation. The skill now intelligently selects between the Parallel Chat API and Perplexity sonar-pro-search based on query type. Added compatibility notes, license information, and improved descriptions for clarity. Removed outdated example scripts to streamline the codebase.
This commit is contained in:
568
scientific-skills/parallel-web/scripts/parallel_web.py
Normal file
568
scientific-skills/parallel-web/scripts/parallel_web.py
Normal file
@@ -0,0 +1,568 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parallel Web Systems API Client
|
||||
|
||||
Provides web search, URL content extraction, and deep research capabilities
|
||||
using the Parallel Web Systems APIs (https://docs.parallel.ai).
|
||||
|
||||
Primary interface: Parallel Chat API (OpenAI-compatible) for search and research.
|
||||
Secondary interface: Extract API for URL verification and special cases.
|
||||
|
||||
Main classes:
|
||||
- ParallelChat: Core Chat API client (base/core models)
|
||||
- ParallelSearch: Web search via Chat API (base model)
|
||||
- ParallelDeepResearch: Deep research via Chat API (core model)
|
||||
- ParallelExtract: URL content extraction (Extract API, verification only)
|
||||
|
||||
Environment variable required:
|
||||
PARALLEL_API_KEY - Your Parallel API key from https://platform.parallel.ai
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
def _get_api_key():
|
||||
"""Validate and return the Parallel API key."""
|
||||
api_key = os.getenv("PARALLEL_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError(
|
||||
"PARALLEL_API_KEY environment variable not set.\n"
|
||||
"Get your key at https://platform.parallel.ai and set it:\n"
|
||||
" export PARALLEL_API_KEY='your_key_here'"
|
||||
)
|
||||
return api_key
|
||||
|
||||
|
||||
def _get_extract_client():
|
||||
"""Create and return a Parallel SDK client for the Extract API."""
|
||||
try:
|
||||
from parallel import Parallel
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"The 'parallel-web' package is required for extract. Install it with:\n"
|
||||
" pip install parallel-web"
|
||||
)
|
||||
return Parallel(api_key=_get_api_key())
|
||||
|
||||
|
||||
class ParallelChat:
    """Core client for the Parallel Chat API.

    OpenAI-compatible chat completions endpoint that performs web research
    and returns synthesized responses with citations.

    Models:
        - base : Standard research, factual queries (15-100s latency)
        - core : Complex research, multi-source synthesis (60s-5min latency)
    """

    # Parallel's Chat API speaks the OpenAI wire protocol at this base URL.
    CHAT_BASE_URL = "https://api.parallel.ai"

    def __init__(self):
        try:
            from openai import OpenAI
        except ImportError:
            raise ImportError(
                "The 'openai' package is required. Install it with:\n"
                "  pip install openai"
            )

        self.client = OpenAI(
            api_key=_get_api_key(),
            base_url=self.CHAT_BASE_URL,
        )

    def query(
        self,
        user_message: str,
        system_message: Optional[str] = None,
        model: str = "base",
    ) -> Dict[str, Any]:
        """Send a query to the Parallel Chat API.

        Args:
            user_message: The research query or question.
            system_message: Optional system prompt to guide response style.
            model: Chat model to use ('base' or 'core').

        Returns:
            Dict with 'content' (response text), 'sources' (citations), and
            metadata. On failure, 'success' is False and 'error' carries the
            exception text instead of 'content'/'sources'.
        """
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        messages = []
        if system_message:
            messages.append({"role": "system", "content": system_message})
        messages.append({"role": "user", "content": user_message})

        try:
            # Progress note goes to stderr so stdout stays clean for results.
            print(f"[Parallel Chat] Querying model={model}...", file=sys.stderr)

            response = self.client.chat.completions.create(
                model=model,
                messages=messages,
                stream=False,
            )

            content = ""
            if response.choices and len(response.choices) > 0:
                content = response.choices[0].message.content or ""

            sources = self._extract_basis(response)

            return {
                "success": True,
                "content": content,
                "sources": sources,
                "citation_count": len(sources),
                "model": model,
                "timestamp": timestamp,
            }

        except Exception as e:
            # Boundary handler: surface the failure as data so the CLI can
            # report it instead of crashing with a traceback.
            return {
                "success": False,
                "error": str(e),
                "model": model,
                "timestamp": timestamp,
            }

    @staticmethod
    def _cit_field(obj: Any, name: str, default: Any) -> Any:
        """Read *name* from a basis/citation entry that may be a dict or object.

        Object-shaped entries with a falsy value (e.g. None) are normalized
        to *default* so callers always receive a usable value.
        """
        if isinstance(obj, dict):
            return obj.get(name, default)
        return getattr(obj, name, None) or default

    def _extract_basis(self, response) -> List[Dict[str, str]]:
        """Extract deduplicated citation sources from the Chat API research basis.

        Returns a list of {'type', 'url', 'title', 'excerpts'} dicts, keeping
        only the first occurrence of each URL and skipping entries without one.
        """
        sources: List[Dict[str, str]] = []
        basis = getattr(response, "basis", None)
        if not basis or not isinstance(basis, list):
            return sources

        seen_urls = set()
        for item in basis:
            for cit in self._cit_field(item, "citations", []):
                url = self._cit_field(cit, "url", "")
                if not url or url in seen_urls:
                    continue
                seen_urls.add(url)
                sources.append({
                    "type": "source",
                    "url": url,
                    "title": self._cit_field(cit, "title", ""),
                    "excerpts": self._cit_field(cit, "excerpts", []),
                })

        return sources
|
||||
|
||||
|
||||
class ParallelSearch:
    """Web search using the Parallel Chat API (base model).

    Sends a search query to the Chat API which performs web research and
    returns a synthesized summary with cited sources.
    """

    SYSTEM_PROMPT = (
        "You are a web research assistant. Search the web and synthesize information "
        "about the user's query. Provide a clear, well-organized summary with:\n"
        "- Key facts, data points, and statistics\n"
        "- Specific names, dates, and numbers when available\n"
        "- Multiple perspectives if the topic is debated\n"
        "Cite your sources inline. Be comprehensive but concise."
    )

    def __init__(self):
        self.chat = ParallelChat()

    def search(
        self,
        objective: str,
        model: str = "base",
    ) -> Dict[str, Any]:
        """Execute a web search via the Chat API.

        Args:
            objective: Natural language description of the search goal.
            model: Chat model to use ('base' or 'core', default 'base').

        Returns:
            Dict with 'response' (synthesized text), 'sources', and metadata.
        """
        outcome = self.chat.query(
            user_message=objective,
            system_message=self.SYSTEM_PROMPT,
            model=model,
        )

        if outcome["success"]:
            return {
                "success": True,
                "objective": objective,
                "response": outcome["content"],
                "sources": outcome["sources"],
                "citation_count": outcome["citation_count"],
                "model": outcome["model"],
                "backend": "parallel-chat",
                "timestamp": outcome["timestamp"],
            }

        return {
            "success": False,
            "objective": objective,
            "error": outcome.get("error", "Unknown error"),
            "timestamp": outcome["timestamp"],
        }
|
||||
|
||||
|
||||
class ParallelExtract:
    """Extract clean content from URLs using Parallel's Extract API.

    Converts any public URL into clean, LLM-optimized markdown.
    Use for citation verification and special cases only.
    For general research, use ParallelSearch or ParallelDeepResearch instead.
    """

    def __init__(self):
        self.client = _get_extract_client()

    def extract(
        self,
        urls: List[str],
        objective: Optional[str] = None,
        excerpts: bool = True,
        full_content: bool = False,
    ) -> Dict[str, Any]:
        """Extract content from one or more URLs.

        Args:
            urls: List of URLs to extract content from.
            objective: Optional objective to focus extraction.
            excerpts: Whether to return focused excerpts (default True).
            full_content: Whether to return full page content (default False).

        Returns:
            Dict with 'results' list containing url, title, excerpts/content.
            On failure, 'success' is False and 'error' holds the message.
        """
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        request = {
            "urls": urls,
            "excerpts": excerpts,
            "full_content": full_content,
        }
        if objective:
            request["objective"] = objective

        try:
            response = self.client.beta.extract(**request)

            raw_results = getattr(response, "results", None) or []
            results = [
                {
                    "url": getattr(entry, "url", ""),
                    "title": getattr(entry, "title", ""),
                    "publish_date": getattr(entry, "publish_date", None),
                    "excerpts": getattr(entry, "excerpts", []),
                    "full_content": getattr(entry, "full_content", None),
                }
                for entry in raw_results
            ]

            raw_errors = getattr(response, "errors", None) or []
            errors = [str(err) for err in raw_errors]

            return {
                "success": True,
                "urls": urls,
                "results": results,
                "errors": errors,
                "timestamp": timestamp,
                "extract_id": getattr(response, "extract_id", None),
            }

        except Exception as e:
            return {
                "success": False,
                "urls": urls,
                "error": str(e),
                "timestamp": timestamp,
            }
|
||||
|
||||
|
||||
class ParallelDeepResearch:
    """Deep research using the Parallel Chat API (core model).

    Sends complex research queries to the Chat API which performs
    multi-source web research and returns comprehensive reports with citations.
    """

    SYSTEM_PROMPT = (
        "You are a deep research analyst. Provide a comprehensive, well-structured "
        "research report on the user's topic. Include:\n"
        "- Executive summary of key findings\n"
        "- Detailed analysis organized by themes\n"
        "- Specific data, statistics, and quantitative evidence\n"
        "- Multiple authoritative sources\n"
        "- Implications and future outlook where relevant\n"
        "Use markdown formatting with clear section headers. "
        "Cite all sources inline."
    )

    def __init__(self):
        self.chat = ParallelChat()

    def research(
        self,
        query: str,
        model: str = "core",
        system_prompt: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Run deep research via the Chat API.

        Args:
            query: The research question or topic.
            model: Chat model to use ('base' or 'core', default 'core').
            system_prompt: Optional override for the system prompt.

        Returns:
            Dict with 'response' (markdown report), 'citations', and metadata.
        """
        outcome = self.chat.query(
            user_message=query,
            system_message=system_prompt if system_prompt else self.SYSTEM_PROMPT,
            model=model,
        )

        if not outcome["success"]:
            return {
                "success": False,
                "query": query,
                "error": outcome.get("error", "Unknown error"),
                "model": model,
                "timestamp": outcome["timestamp"],
            }

        report = outcome["content"]
        cited = outcome["sources"]
        # The same data is exposed under both key namings
        # ('response'/'output', 'citations'/'sources').
        return {
            "success": True,
            "query": query,
            "response": report,
            "output": report,
            "citations": cited,
            "sources": cited,
            "citation_count": outcome["citation_count"],
            "model": model,
            "backend": "parallel-chat",
            "timestamp": outcome["timestamp"],
        }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI Interface
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _print_search_results(result: Dict[str, Any], output_file=None):
|
||||
"""Print search results (synthesized summary + sources)."""
|
||||
def write(text):
|
||||
if output_file:
|
||||
output_file.write(text + "\n")
|
||||
else:
|
||||
print(text)
|
||||
|
||||
if not result["success"]:
|
||||
write(f"Error: {result.get('error', 'Unknown error')}")
|
||||
return
|
||||
|
||||
write(f"\n{'='*80}")
|
||||
write(f"Search: {result['objective']}")
|
||||
write(f"Model: {result['model']} | Time: {result['timestamp']}")
|
||||
write(f"{'='*80}\n")
|
||||
|
||||
write(result.get("response", "No response received."))
|
||||
|
||||
sources = result.get("sources", [])
|
||||
if sources:
|
||||
write(f"\n\n{'='*40} SOURCES {'='*40}")
|
||||
for i, src in enumerate(sources):
|
||||
title = src.get("title", "Untitled")
|
||||
url = src.get("url", "")
|
||||
write(f" [{i+1}] {title}")
|
||||
if url:
|
||||
write(f" {url}")
|
||||
|
||||
|
||||
def _print_extract_results(result: Dict[str, Any], output_file=None):
|
||||
"""Pretty-print extract results."""
|
||||
def write(text):
|
||||
if output_file:
|
||||
output_file.write(text + "\n")
|
||||
else:
|
||||
print(text)
|
||||
|
||||
if not result["success"]:
|
||||
write(f"Error: {result.get('error', 'Unknown error')}")
|
||||
return
|
||||
|
||||
write(f"\n{'='*80}")
|
||||
write(f"Extracted from: {', '.join(result['urls'])}")
|
||||
write(f"Time: {result['timestamp']}")
|
||||
write(f"{'='*80}")
|
||||
|
||||
for i, r in enumerate(result["results"]):
|
||||
write(f"\n--- [{i+1}] {r['title']} ---")
|
||||
write(f"URL: {r['url']}")
|
||||
if r.get("full_content"):
|
||||
write(f"\n{r['full_content']}")
|
||||
elif r.get("excerpts"):
|
||||
for j, excerpt in enumerate(r["excerpts"]):
|
||||
write(f"\nExcerpt {j+1}:")
|
||||
write(excerpt[:2000] if len(excerpt) > 2000 else excerpt)
|
||||
|
||||
if result.get("errors"):
|
||||
write(f"\nErrors: {result['errors']}")
|
||||
|
||||
|
||||
def _print_research_results(result: Dict[str, Any], output_file=None):
|
||||
"""Print deep research results (report + sources)."""
|
||||
def write(text):
|
||||
if output_file:
|
||||
output_file.write(text + "\n")
|
||||
else:
|
||||
print(text)
|
||||
|
||||
if not result["success"]:
|
||||
write(f"Error: {result.get('error', 'Unknown error')}")
|
||||
return
|
||||
|
||||
write(f"\n{'='*80}")
|
||||
query_display = result['query'][:100]
|
||||
if len(result['query']) > 100:
|
||||
query_display += "..."
|
||||
write(f"Research: {query_display}")
|
||||
write(f"Model: {result['model']} | Citations: {result.get('citation_count', 0)} | Time: {result['timestamp']}")
|
||||
write(f"{'='*80}\n")
|
||||
|
||||
write(result.get("response", result.get("output", "No output received.")))
|
||||
|
||||
citations = result.get("citations", result.get("sources", []))
|
||||
if citations:
|
||||
write(f"\n\n{'='*40} SOURCES {'='*40}")
|
||||
seen_urls = set()
|
||||
for cit in citations:
|
||||
url = cit.get("url", "")
|
||||
if url and url not in seen_urls:
|
||||
seen_urls.add(url)
|
||||
title = cit.get("title", "Untitled")
|
||||
write(f" [{len(seen_urls)}] {title}")
|
||||
write(f" {url}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and dispatch to the chosen command.

    Returns:
        Process exit code: 0 on success, 1 on error or missing command.
    """
    parser = argparse.ArgumentParser(
        description="Parallel Web Systems API Client - Search, Extract, and Deep Research",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python parallel_web.py search "latest advances in quantum computing"
  python parallel_web.py search "climate policy 2025" --model core
  python parallel_web.py extract "https://example.com" --objective "key findings"
  python parallel_web.py research "comprehensive analysis of EV battery market"
  python parallel_web.py research "compare mRNA vs protein subunit vaccines" --model base
  python parallel_web.py research "AI regulation landscape 2025" -o report.md
""",
    )

    subparsers = parser.add_subparsers(dest="command", help="API command")

    # --- search subcommand ---
    search_parser = subparsers.add_parser("search", help="Web search via Chat API (synthesized results)")
    search_parser.add_argument("objective", help="Natural language search objective")
    search_parser.add_argument("--model", default="base", choices=["base", "core"],
                               help="Chat model to use (default: base)")
    search_parser.add_argument("-o", "--output", help="Write output to file")
    search_parser.add_argument("--json", action="store_true", help="Output as JSON")

    # --- extract subcommand ---
    extract_parser = subparsers.add_parser("extract", help="Extract content from URLs (verification only)")
    extract_parser.add_argument("urls", nargs="+", help="One or more URLs to extract")
    extract_parser.add_argument("--objective", help="Objective to focus extraction")
    extract_parser.add_argument("--full-content", action="store_true", help="Return full page content")
    extract_parser.add_argument("-o", "--output", help="Write output to file")
    extract_parser.add_argument("--json", action="store_true", help="Output as JSON")

    # --- research subcommand ---
    research_parser = subparsers.add_parser("research", help="Deep research via Chat API (comprehensive report)")
    research_parser.add_argument("query", help="Research question or topic")
    research_parser.add_argument("--model", default="core", choices=["base", "core"],
                                 help="Chat model to use (default: core)")
    research_parser.add_argument("-o", "--output", help="Write output to file")
    research_parser.add_argument("--json", action="store_true", help="Output as JSON")

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return 1

    output_file = None
    if getattr(args, "output", None):
        output_file = open(args.output, "w", encoding="utf-8")

    def emit(result, printer):
        """Write *result* as pretty JSON when --json is set, else via *printer*.

        Centralizes the output path so JSON serialization and file-vs-stdout
        routing are defined once for all three subcommands.
        """
        if args.json:
            text = json.dumps(result, indent=2, ensure_ascii=False, default=str)
            (output_file or sys.stdout).write(text + "\n")
        else:
            printer(result, output_file)

    try:
        if args.command == "search":
            result = ParallelSearch().search(
                objective=args.objective,
                model=args.model,
            )
            emit(result, _print_search_results)

        elif args.command == "extract":
            result = ParallelExtract().extract(
                urls=args.urls,
                objective=args.objective,
                full_content=args.full_content,
            )
            emit(result, _print_extract_results)

        elif args.command == "research":
            result = ParallelDeepResearch().research(
                query=args.query,
                model=args.model,
            )
            emit(result, _print_research_results)

        return 0

    except Exception as e:
        # Top-level boundary: report the failure and exit nonzero instead of
        # surfacing a traceback to the CLI user.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    finally:
        if output_file:
            output_file.close()
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return code as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user