Initial commit for citation-management
This commit is contained in:
264
assets/bibtex_template.bib
Normal file
264
assets/bibtex_template.bib
Normal file
@@ -0,0 +1,264 @@
|
|||||||
|
% BibTeX Template File
|
||||||
|
% Examples of properly formatted entries for all common types
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% JOURNAL ARTICLES
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
@article{Jumper2021,
|
||||||
|
author = {Jumper, John and Evans, Richard and Pritzel, Alexander and Green, Tim and Figurnov, Michael and Ronneberger, Olaf and Tunyasuvunakool, Kathryn and Bates, Russ and {\v{Z}}{\'\i}dek, Augustin and Potapenko, Anna and others},
|
||||||
|
title = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
|
||||||
|
journal = {Nature},
|
||||||
|
year = {2021},
|
||||||
|
volume = {596},
|
||||||
|
number = {7873},
|
||||||
|
pages = {583--589},
|
||||||
|
doi = {10.1038/s41586-021-03819-2}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{Watson1953,
|
||||||
|
author = {Watson, James D. and Crick, Francis H. C.},
|
||||||
|
title = {Molecular Structure of Nucleic Acids: A Structure for Deoxyribose Nucleic Acid},
|
||||||
|
journal = {Nature},
|
||||||
|
year = {1953},
|
||||||
|
volume = {171},
|
||||||
|
number = {4356},
|
||||||
|
pages = {737--738},
|
||||||
|
doi = {10.1038/171737a0}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{Doudna2014,
|
||||||
|
author = {Doudna, Jennifer A. and Charpentier, Emmanuelle},
|
||||||
|
title = {The New Frontier of Genome Engineering with {CRISPR-Cas9}},
|
||||||
|
journal = {Science},
|
||||||
|
year = {2014},
|
||||||
|
volume = {346},
|
||||||
|
number = {6213},
|
||||||
|
pages = {1258096},
|
||||||
|
doi = {10.1126/science.1258096}
|
||||||
|
}
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% BOOKS
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
@book{Kumar2021,
|
||||||
|
author = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
|
||||||
|
title = {Robbins and Cotran Pathologic Basis of Disease},
|
||||||
|
publisher = {Elsevier},
|
||||||
|
year = {2021},
|
||||||
|
edition = {10},
|
||||||
|
address = {Philadelphia, PA},
|
||||||
|
isbn = {978-0-323-53113-9}
|
||||||
|
}
|
||||||
|
|
||||||
|
@book{Alberts2014,
|
||||||
|
author = {Alberts, Bruce and Johnson, Alexander and Lewis, Julian and Morgan, David and Raff, Martin and Roberts, Keith and Walter, Peter},
|
||||||
|
title = {Molecular Biology of the Cell},
|
||||||
|
publisher = {Garland Science},
|
||||||
|
year = {2014},
|
||||||
|
edition = {6},
|
||||||
|
address = {New York, NY},
|
||||||
|
isbn = {978-0-815-34432-2}
|
||||||
|
}
|
||||||
|
|
||||||
|
% Book with editor instead of author
|
||||||
|
@book{Sambrook2001,
|
||||||
|
editor = {Sambrook, Joseph and Russell, David W.},
|
||||||
|
title = {Molecular Cloning: A Laboratory Manual},
|
||||||
|
publisher = {Cold Spring Harbor Laboratory Press},
|
||||||
|
year = {2001},
|
||||||
|
edition = {3},
|
||||||
|
address = {Cold Spring Harbor, NY},
|
||||||
|
isbn = {978-0-879-69576-7}
|
||||||
|
}
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% CONFERENCE PAPERS (PROCEEDINGS)
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
@inproceedings{Vaswani2017,
|
||||||
|
author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
|
||||||
|
title = {Attention is All You Need},
|
||||||
|
booktitle = {Advances in Neural Information Processing Systems 30 (NeurIPS 2017)},
|
||||||
|
year = {2017},
|
||||||
|
pages = {5998--6008},
|
||||||
|
address = {Long Beach, CA},
|
||||||
|
url = {https://proceedings.neurips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html}
|
||||||
|
}
|
||||||
|
|
||||||
|
@inproceedings{He2016,
|
||||||
|
author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
|
||||||
|
title = {Deep Residual Learning for Image Recognition},
|
||||||
|
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
|
||||||
|
year = {2016},
|
||||||
|
pages = {770--778},
|
||||||
|
address = {Las Vegas, NV},
|
||||||
|
doi = {10.1109/CVPR.2016.90}
|
||||||
|
}
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% BOOK CHAPTERS
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
@incollection{Brown2020,
|
||||||
|
author = {Brown, Patrick O. and Botstein, David},
|
||||||
|
title = {Exploring the New World of the Genome with {DNA} Microarrays},
|
||||||
|
booktitle = {DNA Microarrays: A Molecular Cloning Manual},
|
||||||
|
editor = {Eisen, Michael B. and Brown, Patrick O.},
|
||||||
|
publisher = {Cold Spring Harbor Laboratory Press},
|
||||||
|
year = {2020},
|
||||||
|
pages = {1--45},
|
||||||
|
address = {Cold Spring Harbor, NY}
|
||||||
|
}
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% PHD THESES / DISSERTATIONS
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
@phdthesis{Johnson2023,
|
||||||
|
author = {Johnson, Mary L.},
|
||||||
|
title = {Novel Approaches to Cancer Immunotherapy Using {CRISPR} Technology},
|
||||||
|
school = {Stanford University},
|
||||||
|
year = {2023},
|
||||||
|
type = {{PhD} dissertation},
|
||||||
|
address = {Stanford, CA}
|
||||||
|
}
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% MASTER'S THESES
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
@mastersthesis{Smith2022,
|
||||||
|
author = {Smith, Robert J.},
|
||||||
|
title = {Machine Learning Methods for Protein Structure Prediction},
|
||||||
|
school = {Massachusetts Institute of Technology},
|
||||||
|
year = {2022},
|
||||||
|
type = {{Master's} thesis},
|
||||||
|
address = {Cambridge, MA}
|
||||||
|
}
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% TECHNICAL REPORTS
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
@techreport{WHO2020,
|
||||||
|
author = {{World Health Organization}},
|
||||||
|
title = {Clinical Management of {COVID-19}: Interim Guidance},
|
||||||
|
institution = {World Health Organization},
|
||||||
|
year = {2020},
|
||||||
|
type = {Technical Report},
|
||||||
|
number = {WHO/2019-nCoV/clinical/2020.5},
|
||||||
|
address = {Geneva, Switzerland}
|
||||||
|
}
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% PREPRINTS
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
% bioRxiv preprint
|
||||||
|
@misc{Zhang2024preprint,
|
||||||
|
author = {Zhang, Yi and Chen, Li and Wang, Hui and Liu, Xin},
|
||||||
|
title = {Novel Therapeutic Targets in {Alzheimer}'s Disease},
|
||||||
|
year = {2024},
|
||||||
|
howpublished = {bioRxiv},
|
||||||
|
doi = {10.1101/2024.01.15.575432},
|
||||||
|
note = {Preprint}
|
||||||
|
}
|
||||||
|
|
||||||
|
% arXiv preprint
|
||||||
|
@misc{Brown2024arxiv,
|
||||||
|
author = {Brown, Alice and Green, Bob},
|
||||||
|
title = {Advances in Quantum Computing},
|
||||||
|
year = {2024},
|
||||||
|
howpublished = {arXiv},
|
||||||
|
note = {arXiv:2401.12345}
|
||||||
|
}
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% DATASETS
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
@misc{AlphaFoldDB2021,
|
||||||
|
author = {{DeepMind} and {EMBL-EBI}},
|
||||||
|
title = {{AlphaFold} Protein Structure Database},
|
||||||
|
year = {2021},
|
||||||
|
howpublished = {Database},
|
||||||
|
url = {https://alphafold.ebi.ac.uk/},
|
||||||
|
doi = {10.1093/nar/gkab1061},
|
||||||
|
note = {Version 4}
|
||||||
|
}
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% SOFTWARE / CODE
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
@misc{McKinney2010pandas,
|
||||||
|
author = {McKinney, Wes},
|
||||||
|
title = {pandas: A Foundational {Python} Library for Data Analysis and Statistics},
|
||||||
|
year = {2010},
|
||||||
|
howpublished = {Software},
|
||||||
|
url = {https://pandas.pydata.org/},
|
||||||
|
note = {Python Data Analysis Library}
|
||||||
|
}
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% WEBSITES / ONLINE RESOURCES
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
@misc{NCBI2024,
|
||||||
|
author = {{National Center for Biotechnology Information}},
|
||||||
|
title = {{PubMed}: Database of Biomedical Literature},
|
||||||
|
year = {2024},
|
||||||
|
howpublished = {Website},
|
||||||
|
url = {https://pubmed.ncbi.nlm.nih.gov/},
|
||||||
|
note = {Accessed: 2024-01-15}
|
||||||
|
}
|
||||||
|
|
||||||
|
% =============================================================================
|
||||||
|
% SPECIAL CASES
|
||||||
|
% =============================================================================
|
||||||
|
|
||||||
|
% Article with organization as author
|
||||||
|
@article{NatureEditorial2023,
|
||||||
|
author = {{Nature Editorial Board}},
|
||||||
|
title = {The Future of {AI} in Scientific Research},
|
||||||
|
journal = {Nature},
|
||||||
|
year = {2023},
|
||||||
|
volume = {615},
|
||||||
|
pages = {1--2},
|
||||||
|
doi = {10.1038/d41586-023-00001-1}
|
||||||
|
}
|
||||||
|
|
||||||
|
% Article with no volume number (some journals)
|
||||||
|
@article{OpenAccess2024,
|
||||||
|
author = {Williams, Sarah and Thomas, Michael},
|
||||||
|
title = {Open Access Publishing in the 21st Century},
|
||||||
|
journal = {Journal of Scholarly Communication},
|
||||||
|
year = {2024},
|
||||||
|
pages = {e123456},
|
||||||
|
doi = {10.1234/jsc.2024.123456}
|
||||||
|
}
|
||||||
|
|
||||||
|
% Conference paper with DOI
|
||||||
|
@inproceedings{Garcia2023,
|
||||||
|
author = {Garc{\'i}a-Mart{\'i}nez, Jos{\'e} and M{\"u}ller, Hans},
|
||||||
|
title = {International Collaboration in Science},
|
||||||
|
booktitle = {Proceedings of the International Conference on Academic Publishing},
|
||||||
|
year = {2023},
|
||||||
|
pages = {45--52},
|
||||||
|
doi = {10.1109/ICAP.2023.123456}
|
||||||
|
}
|
||||||
|
|
||||||
|
% Article with PMID but no DOI (older papers)
|
||||||
|
@article{OldPaper1995,
|
||||||
|
author = {Anderson, Philip W.},
|
||||||
|
title = {Through the Glass Lightly},
|
||||||
|
journal = {Science},
|
||||||
|
year = {1995},
|
||||||
|
volume = {267},
|
||||||
|
number = {5204},
|
||||||
|
pages = {1615--1616},
|
||||||
|
note = {PMID: 17808148}
|
||||||
|
}
|
||||||
|
|
||||||
386
assets/citation_checklist.md
Normal file
386
assets/citation_checklist.md
Normal file
@@ -0,0 +1,386 @@
|
|||||||
|
# Citation Quality Checklist
|
||||||
|
|
||||||
|
Use this checklist to ensure your citations are accurate, complete, and properly formatted before final submission.
|
||||||
|
|
||||||
|
## Pre-Submission Checklist
|
||||||
|
|
||||||
|
### ✓ Metadata Accuracy
|
||||||
|
|
||||||
|
- [ ] All author names are correct and properly formatted
|
||||||
|
- [ ] Article titles match the actual publication
|
||||||
|
- [ ] Journal/conference names are complete (not abbreviated unless required)
|
||||||
|
- [ ] Publication years are accurate
|
||||||
|
- [ ] Volume and issue numbers are correct
|
||||||
|
- [ ] Page ranges are accurate
|
||||||
|
|
||||||
|
### ✓ Required Fields
|
||||||
|
|
||||||
|
- [ ] All @article entries have: author, title, journal, year
|
||||||
|
- [ ] All @book entries have: author/editor, title, publisher, year
|
||||||
|
- [ ] All @inproceedings entries have: author, title, booktitle, year
|
||||||
|
- [ ] Modern papers (2000+) include DOI when available
|
||||||
|
- [ ] All entries have unique citation keys
|
||||||
|
|
||||||
|
### ✓ DOI Verification
|
||||||
|
|
||||||
|
- [ ] All DOIs are properly formatted (10.XXXX/...)
|
||||||
|
- [ ] DOIs resolve correctly to the article
|
||||||
|
- [ ] No DOI prefix in the BibTeX field (no "doi:" or "https://doi.org/")
|
||||||
|
- [ ] Metadata from CrossRef matches your BibTeX entry
|
||||||
|
- [ ] Run: `python scripts/validate_citations.py references.bib --check-dois`
|
||||||
|
|
||||||
|
### ✓ Formatting Consistency
|
||||||
|
|
||||||
|
- [ ] Page ranges use double hyphen (--) not single (-)
|
||||||
|
- [ ] No "pp." prefix in pages field
|
||||||
|
- [ ] Author names use "and" separator (not semicolon or ampersand)
|
||||||
|
- [ ] Capitalization protected in titles ({AlphaFold}, {CRISPR}, etc.)
|
||||||
|
- [ ] Month names use standard abbreviations if included
|
||||||
|
- [ ] Citation keys follow consistent format
|
||||||
|
|
||||||
|
### ✓ Duplicate Detection
|
||||||
|
|
||||||
|
- [ ] No duplicate DOIs in bibliography
|
||||||
|
- [ ] No duplicate citation keys
|
||||||
|
- [ ] No near-duplicate titles
|
||||||
|
- [ ] Preprints updated to published versions when available
|
||||||
|
- [ ] Run: `python scripts/validate_citations.py references.bib`
|
||||||
|
|
||||||
|
### ✓ Special Characters
|
||||||
|
|
||||||
|
- [ ] Accented characters properly formatted (e.g., {\"u} for ü)
|
||||||
|
- [ ] Mathematical symbols use LaTeX commands
|
||||||
|
- [ ] Chemical formulas properly formatted
|
||||||
|
- [ ] No unescaped special characters (%, &, $, #, etc.)
|
||||||
|
|
||||||
|
### ✓ BibTeX Syntax
|
||||||
|
|
||||||
|
- [ ] All entries have balanced braces {}
|
||||||
|
- [ ] Fields separated by commas
|
||||||
|
- [ ] No comma after last field in each entry
|
||||||
|
- [ ] Valid entry types (@article, @book, etc.)
|
||||||
|
- [ ] Run: `python scripts/validate_citations.py references.bib`
|
||||||
|
|
||||||
|
### ✓ File Organization
|
||||||
|
|
||||||
|
- [ ] Bibliography sorted in logical order (by year, author, or key)
|
||||||
|
- [ ] Consistent formatting throughout
|
||||||
|
- [ ] No formatting inconsistencies between entries
|
||||||
|
- [ ] Run: `python scripts/format_bibtex.py references.bib --sort year`
|
||||||
|
|
||||||
|
## Automated Validation
|
||||||
|
|
||||||
|
### Step 1: Format and Clean
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/format_bibtex.py references.bib \
|
||||||
|
--deduplicate \
|
||||||
|
--sort year \
|
||||||
|
--descending \
|
||||||
|
--output clean_references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
**What this does**:
|
||||||
|
- Removes duplicates
|
||||||
|
- Standardizes formatting
|
||||||
|
- Fixes common issues (page ranges, DOI format, etc.)
|
||||||
|
- Sorts by year (newest first)
|
||||||
|
|
||||||
|
### Step 2: Validate
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py clean_references.bib \
|
||||||
|
--check-dois \
|
||||||
|
--report validation_report.json \
|
||||||
|
--verbose
|
||||||
|
```
|
||||||
|
|
||||||
|
**What this does**:
|
||||||
|
- Checks required fields
|
||||||
|
- Verifies DOIs resolve
|
||||||
|
- Detects duplicates
|
||||||
|
- Validates syntax
|
||||||
|
- Generates detailed report
|
||||||
|
|
||||||
|
### Step 3: Review Report
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cat validation_report.json
|
||||||
|
```
|
||||||
|
|
||||||
|
**Address any**:
|
||||||
|
- **Errors**: Must fix (missing fields, broken DOIs, syntax errors)
|
||||||
|
- **Warnings**: Should fix (missing recommended fields, formatting issues)
|
||||||
|
- **Duplicates**: Remove or consolidate
|
||||||
|
|
||||||
|
### Step 4: Final Check
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py clean_references.bib --verbose
|
||||||
|
```
|
||||||
|
|
||||||
|
**Goal**: Zero errors, minimal warnings
|
||||||
|
|
||||||
|
## Manual Review Checklist
|
||||||
|
|
||||||
|
### Critical Citations (Top 10-20 Most Important)
|
||||||
|
|
||||||
|
For your most important citations, manually verify:
|
||||||
|
|
||||||
|
- [ ] Visit DOI link and confirm it's the correct article
|
||||||
|
- [ ] Check author names against the actual publication
|
||||||
|
- [ ] Verify year matches publication date
|
||||||
|
- [ ] Confirm journal/conference name is correct
|
||||||
|
- [ ] Check that volume/pages match
|
||||||
|
|
||||||
|
### Common Issues to Watch For
|
||||||
|
|
||||||
|
**Missing Information**:
|
||||||
|
- [ ] No DOI for papers published after 2000
|
||||||
|
- [ ] Missing volume or page numbers for journal articles
|
||||||
|
- [ ] Missing publisher for books
|
||||||
|
- [ ] Missing conference location for proceedings
|
||||||
|
|
||||||
|
**Formatting Errors**:
|
||||||
|
- [ ] Single hyphen in page ranges (123-145 → 123--145)
|
||||||
|
- [ ] Ampersands in author lists (Smith & Jones → Smith and Jones)
|
||||||
|
- [ ] Unprotected acronyms in titles (DNA → {DNA})
|
||||||
|
- [ ] DOI includes URL prefix (https://doi.org/10.xxx → 10.xxx)
|
||||||
|
|
||||||
|
**Metadata Mismatches**:
|
||||||
|
- [ ] Author names differ from publication
|
||||||
|
- [ ] Year is online-first instead of print publication
|
||||||
|
- [ ] Journal name abbreviated when it should be full
|
||||||
|
- [ ] Volume/issue numbers swapped
|
||||||
|
|
||||||
|
**Duplicates**:
|
||||||
|
- [ ] Same paper cited with different citation keys
|
||||||
|
- [ ] Preprint and published version both cited
|
||||||
|
- [ ] Conference paper and journal version both cited
|
||||||
|
|
||||||
|
## Field-Specific Checks
|
||||||
|
|
||||||
|
### Biomedical Sciences
|
||||||
|
|
||||||
|
- [ ] PubMed Central ID (PMCID) included when available
|
||||||
|
- [ ] MeSH terms appropriate (if using)
|
||||||
|
- [ ] Clinical trial registration number included (if applicable)
|
||||||
|
- [ ] All references to treatments/drugs accurately cited
|
||||||
|
|
||||||
|
### Computer Science
|
||||||
|
|
||||||
|
- [ ] arXiv ID included for preprints
|
||||||
|
- [ ] Conference proceedings properly cited (not just "NeurIPS")
|
||||||
|
- [ ] Software/dataset citations include version numbers
|
||||||
|
- [ ] GitHub links stable and permanent
|
||||||
|
|
||||||
|
### General Sciences
|
||||||
|
|
||||||
|
- [ ] Data availability statements properly cited
|
||||||
|
- [ ] Retracted papers identified and removed
|
||||||
|
- [ ] Preprints checked for published versions
|
||||||
|
- [ ] Supplementary materials referenced if critical
|
||||||
|
|
||||||
|
## Final Pre-Submission Steps
|
||||||
|
|
||||||
|
### 1 Week Before Submission
|
||||||
|
|
||||||
|
- [ ] Run full validation with DOI checking
|
||||||
|
- [ ] Fix all errors and critical warnings
|
||||||
|
- [ ] Manually verify top 10-20 most important citations
|
||||||
|
- [ ] Check for any retracted papers
|
||||||
|
|
||||||
|
### 3 Days Before Submission
|
||||||
|
|
||||||
|
- [ ] Re-run validation after any manual edits
|
||||||
|
- [ ] Ensure all in-text citations have corresponding bibliography entries
|
||||||
|
- [ ] Ensure all bibliography entries are cited in text
|
||||||
|
- [ ] Check citation style matches journal requirements
|
||||||
|
|
||||||
|
### 1 Day Before Submission
|
||||||
|
|
||||||
|
- [ ] Final validation check
|
||||||
|
- [ ] LaTeX compilation successful with no warnings
|
||||||
|
- [ ] PDF renders all citations correctly
|
||||||
|
- [ ] Bibliography appears in correct format
|
||||||
|
- [ ] No placeholder citations (Smith et al. XXXX)
|
||||||
|
|
||||||
|
### Submission Day
|
||||||
|
|
||||||
|
- [ ] One final validation run
|
||||||
|
- [ ] No last-minute edits without re-validation
|
||||||
|
- [ ] Bibliography file included in submission package
|
||||||
|
- [ ] Figures/tables referenced in text match bibliography
|
||||||
|
|
||||||
|
## Quality Metrics
|
||||||
|
|
||||||
|
### Excellent Bibliography
|
||||||
|
|
||||||
|
- ✓ 100% of entries have DOIs (for modern papers)
|
||||||
|
- ✓ Zero validation errors
|
||||||
|
- ✓ Zero missing required fields
|
||||||
|
- ✓ Zero broken DOIs
|
||||||
|
- ✓ Zero duplicates
|
||||||
|
- ✓ Consistent formatting throughout
|
||||||
|
- ✓ All citations manually spot-checked
|
||||||
|
|
||||||
|
### Acceptable Bibliography
|
||||||
|
|
||||||
|
- ✓ 90%+ of modern entries have DOIs
|
||||||
|
- ✓ Zero high-severity errors
|
||||||
|
- ✓ Minor warnings only (e.g., missing recommended fields)
|
||||||
|
- ✓ Key citations manually verified
|
||||||
|
- ✓ Compilation succeeds without errors
|
||||||
|
|
||||||
|
### Needs Improvement
|
||||||
|
|
||||||
|
- ✗ Missing DOIs for recent papers
|
||||||
|
- ✗ High-severity validation errors
|
||||||
|
- ✗ Broken or incorrect DOIs
|
||||||
|
- ✗ Duplicate entries
|
||||||
|
- ✗ Inconsistent formatting
|
||||||
|
- ✗ Compilation warnings or errors
|
||||||
|
|
||||||
|
## Emergency Fixes
|
||||||
|
|
||||||
|
If you discover issues at the last minute:
|
||||||
|
|
||||||
|
### Broken DOI
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Find correct DOI
|
||||||
|
# Option 1: Search CrossRef
|
||||||
|
# https://www.crossref.org/
|
||||||
|
|
||||||
|
# Option 2: Search on publisher website
|
||||||
|
# Option 3: Google Scholar
|
||||||
|
|
||||||
|
# Re-extract metadata
|
||||||
|
python scripts/extract_metadata.py --doi CORRECT_DOI
|
||||||
|
```
|
||||||
|
|
||||||
|
### Missing Information
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Extract from DOI
|
||||||
|
python scripts/extract_metadata.py --doi 10.xxxx/yyyy
|
||||||
|
|
||||||
|
# Or from PMID (biomedical)
|
||||||
|
python scripts/extract_metadata.py --pmid 12345678
|
||||||
|
|
||||||
|
# Or from arXiv
|
||||||
|
python scripts/extract_metadata.py --arxiv 2103.12345
|
||||||
|
```
|
||||||
|
|
||||||
|
### Duplicate Entries
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Auto-remove duplicates
|
||||||
|
python scripts/format_bibtex.py references.bib \
|
||||||
|
--deduplicate \
|
||||||
|
--output fixed_references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
### Formatting Errors
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Auto-fix common issues
|
||||||
|
python scripts/format_bibtex.py references.bib \
|
||||||
|
--output fixed_references.bib
|
||||||
|
|
||||||
|
# Then validate
|
||||||
|
python scripts/validate_citations.py fixed_references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
## Long-Term Best Practices
|
||||||
|
|
||||||
|
### During Research
|
||||||
|
|
||||||
|
- [ ] Add citations to bibliography file as you find them
|
||||||
|
- [ ] Extract metadata immediately using DOI
|
||||||
|
- [ ] Validate after every 10-20 additions
|
||||||
|
- [ ] Keep bibliography file under version control
|
||||||
|
|
||||||
|
### During Writing
|
||||||
|
|
||||||
|
- [ ] Cite as you write
|
||||||
|
- [ ] Use consistent citation keys
|
||||||
|
- [ ] Don't delay adding references
|
||||||
|
- [ ] Validate weekly
|
||||||
|
|
||||||
|
### Before Submission
|
||||||
|
|
||||||
|
- [ ] Allow 2-3 days for citation cleanup
|
||||||
|
- [ ] Don't wait until the last day
|
||||||
|
- [ ] Automate what you can
|
||||||
|
- [ ] Manually verify critical citations
|
||||||
|
|
||||||
|
## Tool Quick Reference
|
||||||
|
|
||||||
|
### Extract Metadata
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# From DOI
|
||||||
|
python scripts/doi_to_bibtex.py 10.1038/nature12345
|
||||||
|
|
||||||
|
# From multiple sources
|
||||||
|
python scripts/extract_metadata.py \
|
||||||
|
--doi 10.1038/nature12345 \
|
||||||
|
--pmid 12345678 \
|
||||||
|
--arxiv 2103.12345 \
|
||||||
|
--output references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
### Validate
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Basic validation
|
||||||
|
python scripts/validate_citations.py references.bib
|
||||||
|
|
||||||
|
# With DOI checking (slow but thorough)
|
||||||
|
python scripts/validate_citations.py references.bib --check-dois
|
||||||
|
|
||||||
|
# Generate report
|
||||||
|
python scripts/validate_citations.py references.bib \
|
||||||
|
--report validation.json \
|
||||||
|
--verbose
|
||||||
|
```
|
||||||
|
|
||||||
|
### Format and Clean
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Format and fix issues
|
||||||
|
python scripts/format_bibtex.py references.bib
|
||||||
|
|
||||||
|
# Remove duplicates and sort
|
||||||
|
python scripts/format_bibtex.py references.bib \
|
||||||
|
--deduplicate \
|
||||||
|
--sort year \
|
||||||
|
--descending \
|
||||||
|
--output clean_refs.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
**Minimum Requirements**:
|
||||||
|
1. Run `format_bibtex.py --deduplicate`
|
||||||
|
2. Run `validate_citations.py`
|
||||||
|
3. Fix all errors
|
||||||
|
4. Compile successfully
|
||||||
|
|
||||||
|
**Recommended**:
|
||||||
|
1. Format, deduplicate, and sort
|
||||||
|
2. Validate with `--check-dois`
|
||||||
|
3. Fix all errors and warnings
|
||||||
|
4. Manually verify top citations
|
||||||
|
5. Re-validate after fixes
|
||||||
|
|
||||||
|
**Best Practice**:
|
||||||
|
1. Validate throughout research process
|
||||||
|
2. Use automated tools consistently
|
||||||
|
3. Keep bibliography clean and organized
|
||||||
|
4. Document any special cases
|
||||||
|
5. Final validation 1-3 days before submission
|
||||||
|
|
||||||
|
**Remember**: Citation errors reflect poorly on your scholarship. Taking time to ensure accuracy is worthwhile!
|
||||||
|
|
||||||
908
references/bibtex_formatting.md
Normal file
908
references/bibtex_formatting.md
Normal file
@@ -0,0 +1,908 @@
|
|||||||
|
# BibTeX Formatting Guide
|
||||||
|
|
||||||
|
Comprehensive guide to BibTeX entry types, required fields, formatting conventions, and best practices.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
BibTeX is the standard bibliography format for LaTeX documents. Proper formatting ensures:
|
||||||
|
- Correct citation rendering
|
||||||
|
- Consistent formatting
|
||||||
|
- Compatibility with citation styles
|
||||||
|
- No compilation errors
|
||||||
|
|
||||||
|
This guide covers all common entry types and formatting rules.
|
||||||
|
|
||||||
|
## Entry Types
|
||||||
|
|
||||||
|
### @article - Journal Articles
|
||||||
|
|
||||||
|
**Most common entry type** for peer-reviewed journal articles.
|
||||||
|
|
||||||
|
**Required fields**:
|
||||||
|
- `author`: Author names
|
||||||
|
- `title`: Article title
|
||||||
|
- `journal`: Journal name
|
||||||
|
- `year`: Publication year
|
||||||
|
|
||||||
|
**Optional fields**:
|
||||||
|
- `volume`: Volume number
|
||||||
|
- `number`: Issue number
|
||||||
|
- `pages`: Page range
|
||||||
|
- `month`: Publication month
|
||||||
|
- `doi`: Digital Object Identifier
|
||||||
|
- `url`: URL
|
||||||
|
- `note`: Additional notes
|
||||||
|
|
||||||
|
**Template**:
|
||||||
|
```bibtex
|
||||||
|
@article{CitationKey2024,
|
||||||
|
author = {Last1, First1 and Last2, First2},
|
||||||
|
title = {Article Title Here},
|
||||||
|
journal = {Journal Name},
|
||||||
|
year = {2024},
|
||||||
|
volume = {10},
|
||||||
|
number = {3},
|
||||||
|
pages = {123--145},
|
||||||
|
doi = {10.1234/journal.2024.123456},
|
||||||
|
month = jan
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@article{Jumper2021,
|
||||||
|
author = {Jumper, John and Evans, Richard and Pritzel, Alexander and others},
|
||||||
|
title = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
|
||||||
|
journal = {Nature},
|
||||||
|
year = {2021},
|
||||||
|
volume = {596},
|
||||||
|
number = {7873},
|
||||||
|
pages = {583--589},
|
||||||
|
doi = {10.1038/s41586-021-03819-2}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @book - Books
|
||||||
|
|
||||||
|
**For entire books**.
|
||||||
|
|
||||||
|
**Required fields**:
|
||||||
|
- `author` OR `editor`: Author(s) or editor(s)
|
||||||
|
- `title`: Book title
|
||||||
|
- `publisher`: Publisher name
|
||||||
|
- `year`: Publication year
|
||||||
|
|
||||||
|
**Optional fields**:
|
||||||
|
- `volume`: Volume number (if multi-volume)
|
||||||
|
- `series`: Series name
|
||||||
|
- `address`: Publisher location
|
||||||
|
- `edition`: Edition number
|
||||||
|
- `isbn`: ISBN
|
||||||
|
- `url`: URL
|
||||||
|
|
||||||
|
**Template**:
|
||||||
|
```bibtex
|
||||||
|
@book{CitationKey2024,
|
||||||
|
author = {Last, First},
|
||||||
|
title = {Book Title},
|
||||||
|
publisher = {Publisher Name},
|
||||||
|
year = {2024},
|
||||||
|
edition = {3},
|
||||||
|
address = {City, Country},
|
||||||
|
isbn = {978-0-123-45678-9}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@book{Kumar2021,
|
||||||
|
author = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
|
||||||
|
title = {Robbins and Cotran Pathologic Basis of Disease},
|
||||||
|
publisher = {Elsevier},
|
||||||
|
year = {2021},
|
||||||
|
edition = {10},
|
||||||
|
address = {Philadelphia, PA},
|
||||||
|
isbn = {978-0-323-53113-9}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @inproceedings - Conference Papers
|
||||||
|
|
||||||
|
**For papers in conference proceedings**.
|
||||||
|
|
||||||
|
**Required fields**:
|
||||||
|
- `author`: Author names
|
||||||
|
- `title`: Paper title
|
||||||
|
- `booktitle`: Conference/proceedings name
|
||||||
|
- `year`: Year
|
||||||
|
|
||||||
|
**Optional fields**:
|
||||||
|
- `editor`: Proceedings editor(s)
|
||||||
|
- `volume`: Volume number
|
||||||
|
- `series`: Series name
|
||||||
|
- `pages`: Page range
|
||||||
|
- `address`: Conference location
|
||||||
|
- `month`: Conference month
|
||||||
|
- `organization`: Organizing body
|
||||||
|
- `publisher`: Publisher
|
||||||
|
- `doi`: DOI
|
||||||
|
|
||||||
|
**Template**:
|
||||||
|
```bibtex
|
||||||
|
@inproceedings{CitationKey2024,
|
||||||
|
author = {Last, First},
|
||||||
|
title = {Paper Title},
|
||||||
|
booktitle = {Proceedings of Conference Name},
|
||||||
|
year = {2024},
|
||||||
|
pages = {123--145},
|
||||||
|
address = {City, Country},
|
||||||
|
month = jun
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@inproceedings{Vaswani2017,
|
||||||
|
author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and others},
|
||||||
|
title = {Attention is All You Need},
|
||||||
|
booktitle = {Advances in Neural Information Processing Systems 30 (NeurIPS 2017)},
|
||||||
|
year = {2017},
|
||||||
|
pages = {5998--6008},
|
||||||
|
address = {Long Beach, CA}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note**: `@conference` is an alias for `@inproceedings`.
|
||||||
|
|
||||||
|
### @incollection - Book Chapters
|
||||||
|
|
||||||
|
**For chapters in edited books**.
|
||||||
|
|
||||||
|
**Required fields**:
|
||||||
|
- `author`: Chapter author(s)
|
||||||
|
- `title`: Chapter title
|
||||||
|
- `booktitle`: Book title
|
||||||
|
- `publisher`: Publisher name
|
||||||
|
- `year`: Publication year
|
||||||
|
|
||||||
|
**Optional fields**:
|
||||||
|
- `editor`: Book editor(s)
|
||||||
|
- `volume`: Volume number
|
||||||
|
- `series`: Series name
|
||||||
|
- `type`: Type of section (e.g., "chapter")
|
||||||
|
- `chapter`: Chapter number
|
||||||
|
- `pages`: Page range
|
||||||
|
- `address`: Publisher location
|
||||||
|
- `edition`: Edition
|
||||||
|
- `month`: Month
|
||||||
|
|
||||||
|
**Template**:
|
||||||
|
```bibtex
|
||||||
|
@incollection{CitationKey2024,
|
||||||
|
author = {Last, First},
|
||||||
|
title = {Chapter Title},
|
||||||
|
booktitle = {Book Title},
|
||||||
|
editor = {Editor, Last and Editor2, Last},
|
||||||
|
publisher = {Publisher Name},
|
||||||
|
year = {2024},
|
||||||
|
pages = {123--145},
|
||||||
|
chapter = {5}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@incollection{Brown2020,
|
||||||
|
author = {Brown, Patrick O. and Botstein, David},
|
||||||
|
title = {Exploring the New World of the Genome with {DNA} Microarrays},
|
||||||
|
booktitle = {DNA Microarrays: A Molecular Cloning Manual},
|
||||||
|
editor = {Eisen, Michael B. and Brown, Patrick O.},
|
||||||
|
publisher = {Cold Spring Harbor Laboratory Press},
|
||||||
|
year = {2020},
|
||||||
|
pages = {1--45},
|
||||||
|
address = {Cold Spring Harbor, NY}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @phdthesis - Doctoral Dissertations
|
||||||
|
|
||||||
|
**For PhD dissertations and theses**.
|
||||||
|
|
||||||
|
**Required fields**:
|
||||||
|
- `author`: Author name
|
||||||
|
- `title`: Thesis title
|
||||||
|
- `school`: Institution
|
||||||
|
- `year`: Year
|
||||||
|
|
||||||
|
**Optional fields**:
|
||||||
|
- `type`: Type (e.g., "PhD dissertation", "PhD thesis")
|
||||||
|
- `address`: Institution location
|
||||||
|
- `month`: Month
|
||||||
|
- `url`: URL
|
||||||
|
- `note`: Additional notes
|
||||||
|
|
||||||
|
**Template**:
|
||||||
|
```bibtex
|
||||||
|
@phdthesis{CitationKey2024,
|
||||||
|
author = {Last, First},
|
||||||
|
title = {Dissertation Title},
|
||||||
|
school = {University Name},
|
||||||
|
year = {2024},
|
||||||
|
type = {{PhD} dissertation},
|
||||||
|
address = {City, State}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@phdthesis{Johnson2023,
|
||||||
|
author = {Johnson, Mary L.},
|
||||||
|
title = {Novel Approaches to Cancer Immunotherapy Using {CRISPR} Technology},
|
||||||
|
school = {Stanford University},
|
||||||
|
year = {2023},
|
||||||
|
type = {{PhD} dissertation},
|
||||||
|
address = {Stanford, CA}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note**: `@mastersthesis` is similar but for Master's theses.
|
||||||
|
|
||||||
|
### @mastersthesis - Master's Theses
|
||||||
|
|
||||||
|
**For Master's theses**.
|
||||||
|
|
||||||
|
**Required fields**:
|
||||||
|
- `author`: Author name
|
||||||
|
- `title`: Thesis title
|
||||||
|
- `school`: Institution
|
||||||
|
- `year`: Year
|
||||||
|
|
||||||
|
**Template**:
|
||||||
|
```bibtex
|
||||||
|
@mastersthesis{CitationKey2024,
|
||||||
|
author = {Last, First},
|
||||||
|
title = {Thesis Title},
|
||||||
|
school = {University Name},
|
||||||
|
year = {2024}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @misc - Miscellaneous
|
||||||
|
|
||||||
|
**For items that don't fit other categories** (preprints, datasets, software, websites, etc.).
|
||||||
|
|
||||||
|
**Required fields**:
|
||||||
|
- `author` (if known)
|
||||||
|
- `title`
|
||||||
|
- `year`
|
||||||
|
|
||||||
|
**Optional fields**:
|
||||||
|
- `howpublished`: Repository, website, format
|
||||||
|
- `url`: URL
|
||||||
|
- `doi`: DOI
|
||||||
|
- `note`: Additional information
|
||||||
|
- `month`: Month
|
||||||
|
|
||||||
|
**Template for preprints**:
|
||||||
|
```bibtex
|
||||||
|
@misc{CitationKey2024,
|
||||||
|
author = {Last, First},
|
||||||
|
title = {Preprint Title},
|
||||||
|
year = {2024},
|
||||||
|
howpublished = {bioRxiv},
|
||||||
|
doi = {10.1101/2024.01.01.123456},
|
||||||
|
note = {Preprint}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Template for datasets**:
|
||||||
|
```bibtex
|
||||||
|
@misc{DatasetName2024,
|
||||||
|
author = {Last, First},
|
||||||
|
title = {Dataset Title},
|
||||||
|
year = {2024},
|
||||||
|
howpublished = {Zenodo},
|
||||||
|
doi = {10.5281/zenodo.123456},
|
||||||
|
note = {Version 1.2}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Template for software**:
|
||||||
|
```bibtex
|
||||||
|
@misc{SoftwareName2024,
|
||||||
|
author = {Last, First},
|
||||||
|
title = {Software Name},
|
||||||
|
year = {2024},
|
||||||
|
howpublished = {GitHub},
|
||||||
|
url = {https://github.com/user/repo},
|
||||||
|
note = {Version 2.0}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @techreport - Technical Reports
|
||||||
|
|
||||||
|
**For technical reports**.
|
||||||
|
|
||||||
|
**Required fields**:
|
||||||
|
- `author`: Author name(s)
|
||||||
|
- `title`: Report title
|
||||||
|
- `institution`: Institution
|
||||||
|
- `year`: Year
|
||||||
|
|
||||||
|
**Optional fields**:
|
||||||
|
- `type`: Type of report
|
||||||
|
- `number`: Report number
|
||||||
|
- `address`: Institution location
|
||||||
|
- `month`: Month
|
||||||
|
|
||||||
|
**Template**:
|
||||||
|
```bibtex
|
||||||
|
@techreport{CitationKey2024,
|
||||||
|
author = {Last, First},
|
||||||
|
title = {Report Title},
|
||||||
|
institution = {Institution Name},
|
||||||
|
year = {2024},
|
||||||
|
type = {Technical Report},
|
||||||
|
number = {TR-2024-01}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @unpublished - Unpublished Work
|
||||||
|
|
||||||
|
**For unpublished works** (not preprints - use @misc for those).
|
||||||
|
|
||||||
|
**Required fields**:
|
||||||
|
- `author`: Author name(s)
|
||||||
|
- `title`: Work title
|
||||||
|
- `note`: Description
|
||||||
|
|
||||||
|
**Optional fields**:
|
||||||
|
- `month`: Month
|
||||||
|
- `year`: Year
|
||||||
|
|
||||||
|
**Template**:
|
||||||
|
```bibtex
|
||||||
|
@unpublished{CitationKey2024,
|
||||||
|
author = {Last, First},
|
||||||
|
title = {Work Title},
|
||||||
|
note = {Unpublished manuscript},
|
||||||
|
year = {2024}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @online/@electronic - Online Resources
|
||||||
|
|
||||||
|
**For web pages and online-only content**.
|
||||||
|
|
||||||
|
**Note**: Not standard BibTeX, but supported by many bibliography packages (biblatex).
|
||||||
|
|
||||||
|
**Required fields**:
|
||||||
|
- `author` OR `organization`
|
||||||
|
- `title`
|
||||||
|
- `url`
|
||||||
|
- `year`
|
||||||
|
|
||||||
|
**Template**:
|
||||||
|
```bibtex
|
||||||
|
@online{CitationKey2024,
|
||||||
|
author = {{Organization Name}},
|
||||||
|
title = {Page Title},
|
||||||
|
url = {https://example.com/page},
|
||||||
|
year = {2024},
|
||||||
|
note = {Accessed: 2024-01-15}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Formatting Rules
|
||||||
|
|
||||||
|
### Citation Keys
|
||||||
|
|
||||||
|
**Convention**: `FirstAuthorYEARkeyword`
|
||||||
|
|
||||||
|
**Examples**:
|
||||||
|
```bibtex
|
||||||
|
Smith2024protein
|
||||||
|
Doe2023machine
|
||||||
|
JohnsonWilliams2024cancer % Multiple authors, no space
|
||||||
|
NatureEditorial2024 % No author, use publication
|
||||||
|
WHO2024guidelines % Organization author
|
||||||
|
```
|
||||||
|
|
||||||
|
**Rules**:
|
||||||
|
- Alphanumeric plus: `-`, `_`, `.`, `:`
|
||||||
|
- No spaces
|
||||||
|
- Case-sensitive
|
||||||
|
- Unique within file
|
||||||
|
- Descriptive
|
||||||
|
|
||||||
|
**Avoid**:
|
||||||
|
- Special characters: `@`, `#`, `&`, `%`, `$`
|
||||||
|
- Spaces: use CamelCase or underscores
|
||||||
|
- Starting with numbers: `2024Smith` (some systems disallow)
|
||||||
|
|
||||||
|
### Author Names
|
||||||
|
|
||||||
|
**Recommended format**: `Last, First Middle`
|
||||||
|
|
||||||
|
**Single author**:
|
||||||
|
```bibtex
|
||||||
|
author = {Smith, John}
|
||||||
|
author = {Smith, John A.}
|
||||||
|
author = {Smith, John Andrew}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Multiple authors** - separate with `and`:
|
||||||
|
```bibtex
|
||||||
|
author = {Smith, John and Doe, Jane}
|
||||||
|
author = {Smith, John A. and Doe, Jane M. and Johnson, Mary L.}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Many authors** (10+):
|
||||||
|
```bibtex
|
||||||
|
author = {Smith, John and Doe, Jane and Johnson, Mary and others}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Special cases**:
|
||||||
|
```bibtex
|
||||||
|
% Suffix (Jr., III, etc.)
|
||||||
|
author = {King, Jr., Martin Luther}
|
||||||
|
|
||||||
|
% Organization as author
|
||||||
|
author = {{World Health Organization}}
|
||||||
|
% Note: Double braces keep as single entity
|
||||||
|
|
||||||
|
% Multiple surnames
|
||||||
|
author = {Garc{\'i}a-Mart{\'i}nez, Jos{\'e}}
|
||||||
|
|
||||||
|
% Particles (van, von, de, etc.)
|
||||||
|
author = {van der Waals, Johannes}
|
||||||
|
author = {de Broglie, Louis}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Wrong formats** (don't use):
|
||||||
|
```bibtex
|
||||||
|
author = {Smith, J.; Doe, J.} % Semicolons (wrong)
|
||||||
|
author = {Smith, J., Doe, J.} % Commas (wrong)
|
||||||
|
author = {Smith, J. & Doe, J.} % Ampersand (wrong)
|
||||||
|
author = {Smith J} % No comma
|
||||||
|
```
|
||||||
|
|
||||||
|
### Title Capitalization
|
||||||
|
|
||||||
|
**Protect capitalization** with braces:
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
% Proper nouns, acronyms, formulas
|
||||||
|
title = {{AlphaFold}: Protein Structure Prediction}
|
||||||
|
title = {Machine Learning for {DNA} Sequencing}
|
||||||
|
title = {The {Ising} Model in Statistical Physics}
|
||||||
|
title = {{CRISPR-Cas9} Gene Editing Technology}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Reason**: Citation styles may change capitalization. Braces protect.
|
||||||
|
|
||||||
|
**Examples**:
|
||||||
|
```bibtex
|
||||||
|
% Good
|
||||||
|
title = {Advances in {COVID-19} Treatment}
|
||||||
|
title = {Using {Python} for Data Analysis}
|
||||||
|
title = {The {AlphaFold} Protein Structure Database}
|
||||||
|
|
||||||
|
% Will be lowercase in title case styles
|
||||||
|
title = {Advances in COVID-19 Treatment} % covid-19
|
||||||
|
title = {Using Python for Data Analysis} % python
|
||||||
|
```
|
||||||
|
|
||||||
|
**Whole title protection** (rarely needed):
|
||||||
|
```bibtex
|
||||||
|
title = {{This Entire Title Keeps Its Capitalization}}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Page Ranges
|
||||||
|
|
||||||
|
**Use en-dash** (double hyphen `--`):
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
pages = {123--145} % Correct
|
||||||
|
pages = {1234--1256} % Correct
|
||||||
|
pages = {e0123456} % Article ID (PLOS, etc.)
|
||||||
|
pages = {123} % Single page
|
||||||
|
```
|
||||||
|
|
||||||
|
**Wrong**:
|
||||||
|
```bibtex
|
||||||
|
pages = {123-145} % Single hyphen (don't use)
|
||||||
|
pages = {pp. 123-145} % "pp." not needed
|
||||||
|
pages = {123–145} % Unicode en-dash (may cause issues)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Month Names
|
||||||
|
|
||||||
|
**Use three-letter abbreviations** (unquoted):
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
month = jan
|
||||||
|
month = feb
|
||||||
|
month = mar
|
||||||
|
month = apr
|
||||||
|
month = may
|
||||||
|
month = jun
|
||||||
|
month = jul
|
||||||
|
month = aug
|
||||||
|
month = sep
|
||||||
|
month = oct
|
||||||
|
month = nov
|
||||||
|
month = dec
|
||||||
|
```
|
||||||
|
|
||||||
|
**Or numeric**:
|
||||||
|
```bibtex
|
||||||
|
month = {1} % January
|
||||||
|
month = {12} % December
|
||||||
|
```
|
||||||
|
|
||||||
|
**Or full name in braces**:
|
||||||
|
```bibtex
|
||||||
|
month = {January}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Standard abbreviations work without quotes** because they're defined in BibTeX.
|
||||||
|
|
||||||
|
### Journal Names
|
||||||
|
|
||||||
|
**Full name** (not abbreviated):
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
journal = {Nature}
|
||||||
|
journal = {Science}
|
||||||
|
journal = {Cell}
|
||||||
|
journal = {Proceedings of the National Academy of Sciences}
|
||||||
|
journal = {Journal of the American Chemical Society}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Bibliography style** will handle abbreviation if needed.
|
||||||
|
|
||||||
|
**Avoid manual abbreviation**:
|
||||||
|
```bibtex
|
||||||
|
% Don't do this in BibTeX file
|
||||||
|
journal = {Proc. Natl. Acad. Sci. U.S.A.}
|
||||||
|
|
||||||
|
% Do this instead
|
||||||
|
journal = {Proceedings of the National Academy of Sciences}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Exception**: If style requires abbreviations, use full abbreviated form:
|
||||||
|
```bibtex
|
||||||
|
journal = {Proc. Natl. Acad. Sci. U.S.A.} % If required by style
|
||||||
|
```
|
||||||
|
|
||||||
|
### DOI Formatting
|
||||||
|
|
||||||
|
**Bare DOI format** (preferred — just the identifier, no URL):
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
doi = {10.1038/s41586-021-03819-2}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Not**:
|
||||||
|
```bibtex
|
||||||
|
doi = {https://doi.org/10.1038/s41586-021-03819-2} % Don't include URL
|
||||||
|
doi = {doi:10.1038/s41586-021-03819-2} % Don't include prefix
|
||||||
|
```
|
||||||
|
|
||||||
|
**LaTeX** will format as URL automatically.
|
||||||
|
|
||||||
|
**Note**: Do not add a trailing period inside the DOI value — it becomes part of the link and breaks resolution!
|
||||||
|
|
||||||
|
### URL Formatting
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
url = {https://www.example.com/article}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Use**:
|
||||||
|
- When DOI not available
|
||||||
|
- For web pages
|
||||||
|
- For supplementary materials
|
||||||
|
|
||||||
|
**Don't duplicate**:
|
||||||
|
```bibtex
|
||||||
|
% Don't include both if DOI URL is same as url
|
||||||
|
doi = {10.1038/nature12345}
|
||||||
|
url = {https://doi.org/10.1038/nature12345} % Redundant!
|
||||||
|
```
|
||||||
|
|
||||||
|
### Special Characters
|
||||||
|
|
||||||
|
**Accents and diacritics**:
|
||||||
|
```bibtex
|
||||||
|
author = {M{\"u}ller, Hans} % ü
|
||||||
|
author = {Garc{\'i}a, Jos{\'e}} % í, é
|
||||||
|
author = {Erd{\H{o}}s, Paul} % ő
|
||||||
|
author = {Schr{\"o}dinger, Erwin} % ö
|
||||||
|
```
|
||||||
|
|
||||||
|
**Or use UTF-8** (with proper LaTeX setup):
|
||||||
|
```bibtex
|
||||||
|
author = {Müller, Hans}
|
||||||
|
author = {García, José}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Mathematical symbols**:
|
||||||
|
```bibtex
|
||||||
|
title = {The $\alpha$-helix Structure}
|
||||||
|
title = {$\beta$-sheet Prediction}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Chemical formulas**:
|
||||||
|
```bibtex
|
||||||
|
title = {H$_2$O Molecular Dynamics}
|
||||||
|
% Or with chemformula package:
|
||||||
|
title = {\ce{H2O} Molecular Dynamics}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Field Order
|
||||||
|
|
||||||
|
**Recommended order** (for readability):
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@article{Key,
|
||||||
|
author = {},
|
||||||
|
title = {},
|
||||||
|
journal = {},
|
||||||
|
year = {},
|
||||||
|
volume = {},
|
||||||
|
number = {},
|
||||||
|
pages = {},
|
||||||
|
doi = {},
|
||||||
|
url = {},
|
||||||
|
note = {}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Rules**:
|
||||||
|
- Most important fields first
|
||||||
|
- Consistent across entries
|
||||||
|
- Use formatter to standardize
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### 1. Consistent Formatting
|
||||||
|
|
||||||
|
Use same format throughout:
|
||||||
|
- Author name format
|
||||||
|
- Title capitalization
|
||||||
|
- Journal names
|
||||||
|
- Citation key style
|
||||||
|
|
||||||
|
### 2. Required Fields
|
||||||
|
|
||||||
|
Always include:
|
||||||
|
- All required fields for entry type
|
||||||
|
- DOI for modern papers (2000+)
|
||||||
|
- Volume and pages for articles
|
||||||
|
- Publisher for books
|
||||||
|
|
||||||
|
### 3. Protect Capitalization
|
||||||
|
|
||||||
|
Use braces for:
|
||||||
|
- Proper nouns: `{AlphaFold}`
|
||||||
|
- Acronyms: `{DNA}`, `{CRISPR}`
|
||||||
|
- Formulas: `{H2O}`
|
||||||
|
- Names: `{Python}`, `{R}`
|
||||||
|
|
||||||
|
### 4. Complete Author Lists
|
||||||
|
|
||||||
|
Include all authors when possible:
|
||||||
|
- All authors if <10
|
||||||
|
- Use "and others" for 10+
|
||||||
|
- Don't abbreviate to "et al." manually
|
||||||
|
|
||||||
|
### 5. Use Standard Entry Types
|
||||||
|
|
||||||
|
Choose correct entry type:
|
||||||
|
- Journal article → `@article`
|
||||||
|
- Book → `@book`
|
||||||
|
- Conference paper → `@inproceedings`
|
||||||
|
- Preprint → `@misc`
|
||||||
|
|
||||||
|
### 6. Validate Syntax
|
||||||
|
|
||||||
|
Check for:
|
||||||
|
- Balanced braces
|
||||||
|
- Commas after fields
|
||||||
|
- Unique citation keys
|
||||||
|
- Valid entry types
|
||||||
|
|
||||||
|
### 7. Use Formatters
|
||||||
|
|
||||||
|
Use automated tools:
|
||||||
|
```bash
|
||||||
|
python scripts/format_bibtex.py references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
Benefits:
|
||||||
|
- Consistent formatting
|
||||||
|
- Catch syntax errors
|
||||||
|
- Standardize field order
|
||||||
|
- Fix common issues
|
||||||
|
|
||||||
|
## Common Mistakes
|
||||||
|
|
||||||
|
### 1. Wrong Author Separator
|
||||||
|
|
||||||
|
**Wrong**:
|
||||||
|
```bibtex
|
||||||
|
author = {Smith, J.; Doe, J.} % Semicolon
|
||||||
|
author = {Smith, J., Doe, J.} % Comma
|
||||||
|
author = {Smith, J. & Doe, J.} % Ampersand
|
||||||
|
```
|
||||||
|
|
||||||
|
**Correct**:
|
||||||
|
```bibtex
|
||||||
|
author = {Smith, John and Doe, Jane}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Missing Commas
|
||||||
|
|
||||||
|
**Wrong**:
|
||||||
|
```bibtex
|
||||||
|
@article{Smith2024,
|
||||||
|
author = {Smith, John} % Missing comma!
|
||||||
|
title = {Title}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Correct**:
|
||||||
|
```bibtex
|
||||||
|
@article{Smith2024,
|
||||||
|
author = {Smith, John}, % Comma after each field
|
||||||
|
title = {Title}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Unprotected Capitalization
|
||||||
|
|
||||||
|
**Wrong**:
|
||||||
|
```bibtex
|
||||||
|
title = {Machine Learning with Python}
|
||||||
|
% "Python" will become "python" in title case
|
||||||
|
```
|
||||||
|
|
||||||
|
**Correct**:
|
||||||
|
```bibtex
|
||||||
|
title = {Machine Learning with {Python}}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Single Hyphen in Pages
|
||||||
|
|
||||||
|
**Wrong**:
|
||||||
|
```bibtex
|
||||||
|
pages = {123-145} % Single hyphen
|
||||||
|
```
|
||||||
|
|
||||||
|
**Correct**:
|
||||||
|
```bibtex
|
||||||
|
pages = {123--145} % Double hyphen (en-dash)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Redundant "pp." in Pages
|
||||||
|
|
||||||
|
**Wrong**:
|
||||||
|
```bibtex
|
||||||
|
pages = {pp. 123--145}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Correct**:
|
||||||
|
```bibtex
|
||||||
|
pages = {123--145}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. DOI with URL Prefix
|
||||||
|
|
||||||
|
**Wrong**:
|
||||||
|
```bibtex
|
||||||
|
doi = {https://doi.org/10.1038/nature12345}
|
||||||
|
doi = {doi:10.1038/nature12345}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Correct**:
|
||||||
|
```bibtex
|
||||||
|
doi = {10.1038/nature12345}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example Complete Bibliography
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
% Journal article
|
||||||
|
@article{Jumper2021,
|
||||||
|
author = {Jumper, John and Evans, Richard and Pritzel, Alexander and others},
|
||||||
|
title = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
|
||||||
|
journal = {Nature},
|
||||||
|
year = {2021},
|
||||||
|
volume = {596},
|
||||||
|
number = {7873},
|
||||||
|
pages = {583--589},
|
||||||
|
doi = {10.1038/s41586-021-03819-2}
|
||||||
|
}
|
||||||
|
|
||||||
|
% Book
|
||||||
|
@book{Kumar2021,
|
||||||
|
author = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
|
||||||
|
title = {Robbins and Cotran Pathologic Basis of Disease},
|
||||||
|
publisher = {Elsevier},
|
||||||
|
year = {2021},
|
||||||
|
edition = {10},
|
||||||
|
address = {Philadelphia, PA},
|
||||||
|
isbn = {978-0-323-53113-9}
|
||||||
|
}
|
||||||
|
|
||||||
|
% Conference paper
|
||||||
|
@inproceedings{Vaswani2017,
|
||||||
|
author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and others},
|
||||||
|
title = {Attention is All You Need},
|
||||||
|
booktitle = {Advances in Neural Information Processing Systems 30 (NeurIPS 2017)},
|
||||||
|
year = {2017},
|
||||||
|
pages = {5998--6008}
|
||||||
|
}
|
||||||
|
|
||||||
|
% Book chapter
|
||||||
|
@incollection{Brown2020,
|
||||||
|
author = {Brown, Patrick O. and Botstein, David},
|
||||||
|
title = {Exploring the New World of the Genome with {DNA} Microarrays},
|
||||||
|
booktitle = {DNA Microarrays: A Molecular Cloning Manual},
|
||||||
|
editor = {Eisen, Michael B. and Brown, Patrick O.},
|
||||||
|
publisher = {Cold Spring Harbor Laboratory Press},
|
||||||
|
year = {2020},
|
||||||
|
pages = {1--45}
|
||||||
|
}
|
||||||
|
|
||||||
|
% PhD thesis
|
||||||
|
@phdthesis{Johnson2023,
|
||||||
|
author = {Johnson, Mary L.},
|
||||||
|
title = {Novel Approaches to Cancer Immunotherapy},
|
||||||
|
school = {Stanford University},
|
||||||
|
year = {2023},
|
||||||
|
type = {{PhD} dissertation}
|
||||||
|
}
|
||||||
|
|
||||||
|
% Preprint
|
||||||
|
@misc{Zhang2024,
|
||||||
|
author = {Zhang, Yi and Chen, Li and Wang, Hui},
|
||||||
|
title = {Novel Therapeutic Targets in {Alzheimer}'s Disease},
|
||||||
|
year = {2024},
|
||||||
|
howpublished = {bioRxiv},
|
||||||
|
doi = {10.1101/2024.01.001},
|
||||||
|
note = {Preprint}
|
||||||
|
}
|
||||||
|
|
||||||
|
% Dataset
|
||||||
|
@misc{AlphaFoldDB2021,
|
||||||
|
author = {{DeepMind} and {EMBL-EBI}},
|
||||||
|
title = {{AlphaFold} Protein Structure Database},
|
||||||
|
year = {2021},
|
||||||
|
howpublished = {Database},
|
||||||
|
url = {https://alphafold.ebi.ac.uk/},
|
||||||
|
doi = {10.1093/nar/gkab1061}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
BibTeX formatting essentials:
|
||||||
|
|
||||||
|
✓ **Choose correct entry type** (@article, @book, etc.)
|
||||||
|
✓ **Include all required fields**
|
||||||
|
✓ **Use `and` for multiple authors**
|
||||||
|
✓ **Protect capitalization** with braces
|
||||||
|
✓ **Use `--` for page ranges**
|
||||||
|
✓ **Include DOI** for modern papers
|
||||||
|
✓ **Validate syntax** before compilation
|
||||||
|
|
||||||
|
Use formatting tools to ensure consistency:
|
||||||
|
```bash
|
||||||
|
python scripts/format_bibtex.py references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
Properly formatted BibTeX ensures correct, consistent citations across all bibliography styles!
|
||||||
|
|
||||||
794
references/citation_validation.md
Normal file
794
references/citation_validation.md
Normal file
@@ -0,0 +1,794 @@
|
|||||||
|
# Citation Validation Guide
|
||||||
|
|
||||||
|
Comprehensive guide to validating citation accuracy, completeness, and formatting in BibTeX files.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Citation validation ensures:
|
||||||
|
- All citations are accurate and complete
|
||||||
|
- DOIs resolve correctly
|
||||||
|
- Required fields are present
|
||||||
|
- No duplicate entries
|
||||||
|
- Proper formatting and syntax
|
||||||
|
- Links are accessible
|
||||||
|
|
||||||
|
Validation should be performed:
|
||||||
|
- After extracting metadata
|
||||||
|
- Before manuscript submission
|
||||||
|
- After manual edits to BibTeX files
|
||||||
|
- Periodically for maintained bibliographies
|
||||||
|
|
||||||
|
## Validation Categories
|
||||||
|
|
||||||
|
### 1. DOI Verification
|
||||||
|
|
||||||
|
**Purpose**: Ensure DOIs are valid and resolve correctly.
|
||||||
|
|
||||||
|
#### What to Check
|
||||||
|
|
||||||
|
**DOI format**:
|
||||||
|
```
|
||||||
|
Valid: 10.1038/s41586-021-03819-2
|
||||||
|
Valid: 10.1126/science.aam9317
|
||||||
|
Invalid: 10.1038/invalid
|
||||||
|
Invalid: doi:10.1038/... (should omit "doi:" prefix in BibTeX)
|
||||||
|
```
|
||||||
|
|
||||||
|
**DOI resolution**:
|
||||||
|
- DOI should resolve via https://doi.org/
|
||||||
|
- Should redirect to actual article
|
||||||
|
- Should not return 404 or error
|
||||||
|
|
||||||
|
**Metadata consistency**:
|
||||||
|
- CrossRef metadata should match BibTeX
|
||||||
|
- Author names should align
|
||||||
|
- Title should match
|
||||||
|
- Year should match
|
||||||
|
|
||||||
|
#### How to Validate
|
||||||
|
|
||||||
|
**Manual check**:
|
||||||
|
1. Copy DOI from BibTeX
|
||||||
|
2. Visit https://doi.org/10.1038/nature12345
|
||||||
|
3. Verify it redirects to correct article
|
||||||
|
4. Check metadata matches
|
||||||
|
|
||||||
|
**Automated check** (recommended):
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py references.bib --check-dois
|
||||||
|
```
|
||||||
|
|
||||||
|
**Process**:
|
||||||
|
1. Extract all DOIs from BibTeX file
|
||||||
|
2. Query doi.org resolver for each
|
||||||
|
3. Query CrossRef API for metadata
|
||||||
|
4. Compare metadata with BibTeX entry
|
||||||
|
5. Report discrepancies
|
||||||
|
|
||||||
|
#### Common Issues
|
||||||
|
|
||||||
|
**Broken DOIs**:
|
||||||
|
- Typos in DOI
|
||||||
|
- Publisher changed DOI (rare)
|
||||||
|
- Article retracted
|
||||||
|
- Solution: Find correct DOI from publisher site
|
||||||
|
|
||||||
|
**Mismatched metadata**:
|
||||||
|
- BibTeX has old/incorrect information
|
||||||
|
- Solution: Re-extract metadata from CrossRef
|
||||||
|
|
||||||
|
**Missing DOIs**:
|
||||||
|
- Older articles may not have DOIs
|
||||||
|
- Acceptable for pre-2000 publications
|
||||||
|
- Add URL or PMID instead
|
||||||
|
|
||||||
|
### 2. Required Fields
|
||||||
|
|
||||||
|
**Purpose**: Ensure all necessary information is present.
|
||||||
|
|
||||||
|
#### Required by Entry Type
|
||||||
|
|
||||||
|
**@article**:
|
||||||
|
```bibtex
|
||||||
|
author % REQUIRED
|
||||||
|
title % REQUIRED
|
||||||
|
journal % REQUIRED
|
||||||
|
year % REQUIRED
|
||||||
|
volume % Highly recommended
|
||||||
|
pages % Highly recommended
|
||||||
|
doi % Highly recommended for modern papers
|
||||||
|
```
|
||||||
|
|
||||||
|
**@book**:
|
||||||
|
```bibtex
|
||||||
|
author OR editor % REQUIRED (at least one)
|
||||||
|
title % REQUIRED
|
||||||
|
publisher % REQUIRED
|
||||||
|
year % REQUIRED
|
||||||
|
isbn % Recommended
|
||||||
|
```
|
||||||
|
|
||||||
|
**@inproceedings**:
|
||||||
|
```bibtex
|
||||||
|
author % REQUIRED
|
||||||
|
title % REQUIRED
|
||||||
|
booktitle % REQUIRED (conference/proceedings name)
|
||||||
|
year % REQUIRED
|
||||||
|
pages % Recommended
|
||||||
|
```
|
||||||
|
|
||||||
|
**@incollection** (book chapter):
|
||||||
|
```bibtex
|
||||||
|
author % REQUIRED
|
||||||
|
title % REQUIRED (chapter title)
|
||||||
|
booktitle % REQUIRED (book title)
|
||||||
|
publisher % REQUIRED
|
||||||
|
year % REQUIRED
|
||||||
|
editor % Recommended
|
||||||
|
pages % Recommended
|
||||||
|
```
|
||||||
|
|
||||||
|
**@phdthesis**:
|
||||||
|
```bibtex
|
||||||
|
author % REQUIRED
|
||||||
|
title % REQUIRED
|
||||||
|
school % REQUIRED
|
||||||
|
year % REQUIRED
|
||||||
|
```
|
||||||
|
|
||||||
|
**@misc** (preprints, datasets, etc.):
|
||||||
|
```bibtex
|
||||||
|
author % REQUIRED
|
||||||
|
title % REQUIRED
|
||||||
|
year % REQUIRED
|
||||||
|
howpublished % Recommended (bioRxiv, Zenodo, etc.)
|
||||||
|
doi OR url % At least one required
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Validation Script
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py references.bib --check-required-fields
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output**:
|
||||||
|
```
|
||||||
|
Error: Entry 'Smith2024' missing required field 'journal'
|
||||||
|
Error: Entry 'Doe2023' missing required field 'year'
|
||||||
|
Warning: Entry 'Jones2022' missing recommended field 'volume'
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Author Name Formatting
|
||||||
|
|
||||||
|
**Purpose**: Ensure consistent, correct author name formatting.
|
||||||
|
|
||||||
|
#### Proper Format
|
||||||
|
|
||||||
|
**Recommended BibTeX format**:
|
||||||
|
```bibtex
|
||||||
|
author = {Last1, First1 and Last2, First2 and Last3, First3}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Examples**:
|
||||||
|
```bibtex
|
||||||
|
% Correct
|
||||||
|
author = {Smith, John}
|
||||||
|
author = {Smith, John A.}
|
||||||
|
author = {Smith, John Andrew}
|
||||||
|
author = {Smith, John and Doe, Jane}
|
||||||
|
author = {Smith, John and Doe, Jane and Johnson, Mary}
|
||||||
|
|
||||||
|
% For many authors
|
||||||
|
author = {Smith, John and Doe, Jane and others}
|
||||||
|
|
||||||
|
% Incorrect
|
||||||
|
author = {John Smith} % First Last format (not recommended)
|
||||||
|
author = {Smith, J.; Doe, J.} % Semicolon separator (wrong)
|
||||||
|
author = {Smith J, Doe J} % Missing commas
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Special Cases
|
||||||
|
|
||||||
|
**Suffixes (Jr., III, etc.)**:
|
||||||
|
```bibtex
|
||||||
|
author = {King, Jr., Martin Luther}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Multiple surnames (hyphenated)**:
|
||||||
|
```bibtex
|
||||||
|
author = {Smith-Jones, Mary}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Van, von, de, etc.**:
|
||||||
|
```bibtex
|
||||||
|
author = {van der Waals, Johannes}
|
||||||
|
author = {de Broglie, Louis}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Organizations as authors**:
|
||||||
|
```bibtex
|
||||||
|
author = {{World Health Organization}}
|
||||||
|
% Double braces treat as single author
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Validation Checks
|
||||||
|
|
||||||
|
**Automated validation**:
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py references.bib --check-authors
|
||||||
|
```
|
||||||
|
|
||||||
|
**Checks for**:
|
||||||
|
- Proper separator (and, not &, ; , etc.)
|
||||||
|
- Comma placement
|
||||||
|
- Empty author fields
|
||||||
|
- Malformed names
|
||||||
|
|
||||||
|
### 4. Data Consistency
|
||||||
|
|
||||||
|
**Purpose**: Ensure all fields contain valid, reasonable values.
|
||||||
|
|
||||||
|
#### Year Validation
|
||||||
|
|
||||||
|
**Valid years**:
|
||||||
|
```bibtex
|
||||||
|
year = {2024} % Current/recent
|
||||||
|
year = {1953} % Watson & Crick DNA structure (historical)
|
||||||
|
year = {1665} % Hooke's Micrographia (very old)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Invalid years**:
|
||||||
|
```bibtex
|
||||||
|
year = {24} % Two digits (ambiguous)
|
||||||
|
year = {202} % Typo
|
||||||
|
year = {2025} % Future (unless accepted/in press)
|
||||||
|
year = {0} % Obviously wrong
|
||||||
|
```
|
||||||
|
|
||||||
|
**Check**:
|
||||||
|
- Four digits
|
||||||
|
- Reasonable range (1600-current+1)
|
||||||
|
- Not all zeros
|
||||||
|
|
||||||
|
#### Volume/Number Validation
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
volume = {123} % Numeric
|
||||||
|
volume = {12} % Valid
|
||||||
|
number = {3} % Valid
|
||||||
|
number = {S1} % Supplement issue (valid)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Invalid**:
|
||||||
|
```bibtex
|
||||||
|
volume = {Vol. 123} % Should be just number
|
||||||
|
number = {Issue 3} % Should be just number
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Page Range Validation
|
||||||
|
|
||||||
|
**Correct format**:
|
||||||
|
```bibtex
|
||||||
|
pages = {123--145} % En-dash (two hyphens)
|
||||||
|
pages = {e0123456} % PLOS-style article ID
|
||||||
|
pages = {123} % Single page
|
||||||
|
```
|
||||||
|
|
||||||
|
**Incorrect format**:
|
||||||
|
```bibtex
|
||||||
|
pages = {123-145} % Single hyphen (use --)
|
||||||
|
pages = {pp. 123-145} % Remove "pp."
|
||||||
|
pages = {123–145} % Unicode en-dash (may cause issues)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### URL Validation
|
||||||
|
|
||||||
|
**Check**:
|
||||||
|
- URLs are accessible (return 200 status)
|
||||||
|
- HTTPS when available
|
||||||
|
- No obvious typos
|
||||||
|
- Permanent links (not temporary)
|
||||||
|
|
||||||
|
**Valid**:
|
||||||
|
```bibtex
|
||||||
|
url = {https://www.nature.com/articles/nature12345}
|
||||||
|
url = {https://arxiv.org/abs/2103.14030}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Questionable**:
|
||||||
|
```bibtex
|
||||||
|
url = {http://...} % HTTP instead of HTTPS
|
||||||
|
url = {file:///...} % Local file path
|
||||||
|
url = {bit.ly/...} % URL shortener (not permanent)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Duplicate Detection
|
||||||
|
|
||||||
|
**Purpose**: Find and remove duplicate entries.
|
||||||
|
|
||||||
|
#### Types of Duplicates
|
||||||
|
|
||||||
|
**Exact duplicates** (same DOI):
|
||||||
|
```bibtex
|
||||||
|
@article{Smith2024a,
|
||||||
|
doi = {10.1038/nature12345},
|
||||||
|
...
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{Smith2024b,
|
||||||
|
doi = {10.1038/nature12345}, % Same DOI!
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Near duplicates** (similar title/authors):
|
||||||
|
```bibtex
|
||||||
|
@article{Smith2024,
|
||||||
|
title = {Machine Learning for Drug Discovery},
|
||||||
|
...
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{Smith2024method,
|
||||||
|
title = {Machine learning for drug discovery}, % Same, different case
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Preprint + Published**:
|
||||||
|
```bibtex
|
||||||
|
@misc{Smith2023arxiv,
|
||||||
|
title = {AlphaFold Results},
|
||||||
|
howpublished = {arXiv},
|
||||||
|
...
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{Smith2024,
|
||||||
|
title = {AlphaFold Results}, % Same paper, now published
|
||||||
|
journal = {Nature},
|
||||||
|
...
|
||||||
|
}
|
||||||
|
% Keep published version only
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Detection Methods
|
||||||
|
|
||||||
|
**By DOI** (most reliable):
|
||||||
|
- Same DOI = exact duplicate
|
||||||
|
- Keep one, remove other
|
||||||
|
|
||||||
|
**By title similarity**:
|
||||||
|
- Normalize: lowercase, remove punctuation
|
||||||
|
- Calculate similarity (e.g., Levenshtein distance)
|
||||||
|
- Flag if >90% similar
|
||||||
|
|
||||||
|
**By author-year-title**:
|
||||||
|
- Same first author + year + similar title
|
||||||
|
- Likely duplicate
|
||||||
|
|
||||||
|
**Automated detection**:
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py references.bib --check-duplicates
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output**:
|
||||||
|
```
|
||||||
|
Warning: Possible duplicate entries:
|
||||||
|
- Smith2024a (DOI: 10.1038/nature12345)
|
||||||
|
- Smith2024b (DOI: 10.1038/nature12345)
|
||||||
|
Recommendation: Keep one entry, remove the other.
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Format and Syntax
|
||||||
|
|
||||||
|
**Purpose**: Ensure valid BibTeX syntax.
|
||||||
|
|
||||||
|
#### Common Syntax Errors
|
||||||
|
|
||||||
|
**Missing commas**:
|
||||||
|
```bibtex
|
||||||
|
@article{Smith2024,
|
||||||
|
author = {Smith, John} % Missing comma!
|
||||||
|
title = {Title}
|
||||||
|
}
|
||||||
|
% Should be:
|
||||||
|
author = {Smith, John}, % Comma after each field
|
||||||
|
```
|
||||||
|
|
||||||
|
**Unbalanced braces**:
|
||||||
|
```bibtex
|
||||||
|
title = {Title with {Protected} Text % Missing closing brace
|
||||||
|
% Should be:
|
||||||
|
title = {Title with {Protected} Text}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Missing closing brace for entry**:
|
||||||
|
```bibtex
|
||||||
|
@article{Smith2024,
|
||||||
|
author = {Smith, John},
|
||||||
|
title = {Title}
|
||||||
|
% Missing closing brace!
|
||||||
|
% Should end with:
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Invalid characters in keys**:
|
||||||
|
```bibtex
|
||||||
|
@article{Smith&Doe2024, % & not allowed in key
|
||||||
|
...
|
||||||
|
}
|
||||||
|
% Use:
|
||||||
|
@article{SmithDoe2024,
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### BibTeX Syntax Rules
|
||||||
|
|
||||||
|
**Entry structure**:
|
||||||
|
```bibtex
|
||||||
|
@TYPE{citationkey,
|
||||||
|
field1 = {value1},
|
||||||
|
field2 = {value2},
|
||||||
|
...
|
||||||
|
fieldN = {valueN}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Citation keys**:
|
||||||
|
- Alphanumeric and some punctuation (-, _, ., :)
|
||||||
|
- No spaces
|
||||||
|
- Case-sensitive
|
||||||
|
- Unique within file
|
||||||
|
|
||||||
|
**Field values**:
|
||||||
|
- Enclosed in {braces} or "quotes"
|
||||||
|
- Braces preferred for complex text
|
||||||
|
- Numbers can be unquoted: `year = 2024`
|
||||||
|
|
||||||
|
**Special characters**:
|
||||||
|
- `{` and `}` for grouping
|
||||||
|
- `\` for LaTeX commands
|
||||||
|
- Protect capitalization: `{AlphaFold}`
|
||||||
|
- Accents: `{\"u}`, `{\'e}`, `{\aa}`
|
||||||
|
|
||||||
|
#### Validation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py references.bib --check-syntax
|
||||||
|
```
|
||||||
|
|
||||||
|
**Checks**:
|
||||||
|
- Valid BibTeX structure
|
||||||
|
- Balanced braces
|
||||||
|
- Proper commas
|
||||||
|
- Valid entry types
|
||||||
|
- Unique citation keys
|
||||||
|
|
||||||
|
## Validation Workflow
|
||||||
|
|
||||||
|
### Step 1: Basic Validation
|
||||||
|
|
||||||
|
Run comprehensive validation:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
**Checks all**:
|
||||||
|
- DOI resolution
|
||||||
|
- Required fields
|
||||||
|
- Author formatting
|
||||||
|
- Data consistency
|
||||||
|
- Duplicates
|
||||||
|
- Syntax
|
||||||
|
|
||||||
|
### Step 2: Review Report
|
||||||
|
|
||||||
|
Examine validation report:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"total_entries": 150,
|
||||||
|
"valid_entries": 140,
|
||||||
|
"errors": [
|
||||||
|
{
|
||||||
|
"entry": "Smith2024",
|
||||||
|
"error": "missing_required_field",
|
||||||
|
"field": "journal",
|
||||||
|
"severity": "high"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"entry": "Doe2023",
|
||||||
|
"error": "invalid_doi",
|
||||||
|
"doi": "10.1038/broken",
|
||||||
|
"severity": "high"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"warnings": [
|
||||||
|
{
|
||||||
|
"entry": "Jones2022",
|
||||||
|
"warning": "missing_recommended_field",
|
||||||
|
"field": "volume",
|
||||||
|
"severity": "medium"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"duplicates": [
|
||||||
|
{
|
||||||
|
"entries": ["Smith2024a", "Smith2024b"],
|
||||||
|
"reason": "same_doi",
|
||||||
|
"doi": "10.1038/nature12345"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3: Fix Issues
|
||||||
|
|
||||||
|
**High-priority** (errors):
|
||||||
|
1. Add missing required fields
|
||||||
|
2. Fix broken DOIs
|
||||||
|
3. Remove duplicates
|
||||||
|
4. Correct syntax errors
|
||||||
|
|
||||||
|
**Medium-priority** (warnings):
|
||||||
|
1. Add recommended fields
|
||||||
|
2. Improve author formatting
|
||||||
|
3. Fix page ranges
|
||||||
|
|
||||||
|
**Low-priority**:
|
||||||
|
1. Standardize formatting
|
||||||
|
2. Add URLs for accessibility
|
||||||
|
|
||||||
|
### Step 4: Auto-Fix
|
||||||
|
|
||||||
|
Use auto-fix for safe corrections:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py references.bib \
|
||||||
|
--auto-fix \
|
||||||
|
--output fixed_references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
**Auto-fix can**:
|
||||||
|
- Fix page range format (- to --)
|
||||||
|
- Remove "pp." from pages
|
||||||
|
- Standardize author separators
|
||||||
|
- Fix common syntax errors
|
||||||
|
- Normalize field order
|
||||||
|
|
||||||
|
**Auto-fix cannot**:
|
||||||
|
- Add missing information
|
||||||
|
- Find correct DOIs
|
||||||
|
- Determine which duplicate to keep
|
||||||
|
- Fix semantic errors
|
||||||
|
|
||||||
|
### Step 5: Manual Review
|
||||||
|
|
||||||
|
Review auto-fixed file:
|
||||||
|
```bash
|
||||||
|
# Check what changed
|
||||||
|
diff references.bib fixed_references.bib
|
||||||
|
|
||||||
|
# Review specific entries that had errors
|
||||||
|
grep -A 10 "Smith2024" fixed_references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 6: Re-Validate
|
||||||
|
|
||||||
|
Validate after fixes:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py fixed_references.bib --verbose
|
||||||
|
```
|
||||||
|
|
||||||
|
Should show:
|
||||||
|
```
|
||||||
|
✓ All DOIs valid
|
||||||
|
✓ All required fields present
|
||||||
|
✓ No duplicates found
|
||||||
|
✓ Syntax valid
|
||||||
|
✓ 150/150 entries valid
|
||||||
|
```
|
||||||
|
|
||||||
|
## Validation Checklist
|
||||||
|
|
||||||
|
Use this checklist before final submission:
|
||||||
|
|
||||||
|
### DOI Validation
|
||||||
|
- [ ] All DOIs resolve correctly
|
||||||
|
- [ ] Metadata matches between BibTeX and CrossRef
|
||||||
|
- [ ] No broken or invalid DOIs
|
||||||
|
|
||||||
|
### Completeness
|
||||||
|
- [ ] All entries have required fields
|
||||||
|
- [ ] Modern papers (2000+) have DOIs
|
||||||
|
- [ ] Authors properly formatted
|
||||||
|
- [ ] Journals/conferences properly named
|
||||||
|
|
||||||
|
### Consistency
|
||||||
|
- [ ] Years are 4-digit numbers
|
||||||
|
- [ ] Page ranges use -- not -
|
||||||
|
- [ ] Volume/number are numeric
|
||||||
|
- [ ] URLs are accessible
|
||||||
|
|
||||||
|
### Duplicates
|
||||||
|
- [ ] No entries with same DOI
|
||||||
|
- [ ] No near-duplicate titles
|
||||||
|
- [ ] Preprints updated to published versions
|
||||||
|
|
||||||
|
### Formatting
|
||||||
|
- [ ] Valid BibTeX syntax
|
||||||
|
- [ ] Balanced braces
|
||||||
|
- [ ] Proper commas
|
||||||
|
- [ ] Unique citation keys
|
||||||
|
|
||||||
|
### Final Checks
|
||||||
|
- [ ] Bibliography compiles without errors
|
||||||
|
- [ ] All citations in text appear in bibliography
|
||||||
|
- [ ] All bibliography entries cited in text
|
||||||
|
- [ ] Citation style matches journal requirements
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### 1. Validate Early and Often
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# After extraction
|
||||||
|
python scripts/extract_metadata.py --doi ... --output refs.bib
|
||||||
|
python scripts/validate_citations.py refs.bib
|
||||||
|
|
||||||
|
# After manual edits
|
||||||
|
python scripts/validate_citations.py refs.bib
|
||||||
|
|
||||||
|
# Before submission
|
||||||
|
python scripts/validate_citations.py refs.bib --strict
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Use Automated Tools
|
||||||
|
|
||||||
|
Don't validate manually - use scripts:
|
||||||
|
- Faster
|
||||||
|
- More comprehensive
|
||||||
|
- Catches errors humans miss
|
||||||
|
- Generates reports
|
||||||
|
|
||||||
|
### 3. Keep Backup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Before auto-fix
|
||||||
|
cp references.bib references_backup.bib
|
||||||
|
|
||||||
|
# Run auto-fix
|
||||||
|
python scripts/validate_citations.py references.bib \
|
||||||
|
--auto-fix \
|
||||||
|
--output references_fixed.bib
|
||||||
|
|
||||||
|
# Review changes
|
||||||
|
diff references.bib references_fixed.bib
|
||||||
|
|
||||||
|
# If satisfied, replace
|
||||||
|
mv references_fixed.bib references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. Fix High-Priority First
|
||||||
|
|
||||||
|
**Priority order**:
|
||||||
|
1. Syntax errors (prevent compilation)
|
||||||
|
2. Missing required fields (incomplete citations)
|
||||||
|
3. Broken DOIs (broken links)
|
||||||
|
4. Duplicates (confusion, wasted space)
|
||||||
|
5. Missing recommended fields
|
||||||
|
6. Formatting inconsistencies
|
||||||
|
|
||||||
|
### 5. Document Exceptions
|
||||||
|
|
||||||
|
For entries that can't be fixed:
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@article{Old1950,
|
||||||
|
author = {Smith, John},
|
||||||
|
title = {Title},
|
||||||
|
journal = {Obscure Journal},
|
||||||
|
year = {1950},
|
||||||
|
volume = {12},
|
||||||
|
pages = {34--56},
|
||||||
|
note = {DOI not available for publications before 2000}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Validate Against Journal Requirements
|
||||||
|
|
||||||
|
Different journals have different requirements:
|
||||||
|
- Citation style (numbered, author-year)
|
||||||
|
- Abbreviations (journal names)
|
||||||
|
- Maximum reference count
|
||||||
|
- Format (BibTeX, EndNote, manual)
|
||||||
|
|
||||||
|
Check journal author guidelines!
|
||||||
|
|
||||||
|
## Common Validation Issues
|
||||||
|
|
||||||
|
### Issue 1: Metadata Mismatch
|
||||||
|
|
||||||
|
**Problem**: BibTeX says 2023, CrossRef says 2024.
|
||||||
|
|
||||||
|
**Cause**:
|
||||||
|
- Online-first vs print publication
|
||||||
|
- Correction/update
|
||||||
|
- Extraction error
|
||||||
|
|
||||||
|
**Solution**:
|
||||||
|
1. Check actual article
|
||||||
|
2. Use more recent/accurate date
|
||||||
|
3. Update BibTeX entry
|
||||||
|
4. Re-validate
|
||||||
|
|
||||||
|
### Issue 2: Special Characters
|
||||||
|
|
||||||
|
**Problem**: LaTeX compilation fails on special characters.
|
||||||
|
|
||||||
|
**Cause**:
|
||||||
|
- Accented characters (é, ü, ñ)
|
||||||
|
- Chemical formulas (H₂O)
|
||||||
|
- Math symbols (α, β, ±)
|
||||||
|
|
||||||
|
**Solution**:
|
||||||
|
```bibtex
|
||||||
|
% Use LaTeX commands
|
||||||
|
author = {M{\"u}ller, Hans} % Müller
|
||||||
|
title = {Study of H\textsubscript{2}O} % H₂O
|
||||||
|
% Or use UTF-8 with proper LaTeX packages
|
||||||
|
```
|
||||||
|
|
||||||
|
### Issue 3: Incomplete Extraction
|
||||||
|
|
||||||
|
**Problem**: Extracted metadata missing fields.
|
||||||
|
|
||||||
|
**Cause**:
|
||||||
|
- Source doesn't provide all metadata
|
||||||
|
- Extraction error
|
||||||
|
- Incomplete record
|
||||||
|
|
||||||
|
**Solution**:
|
||||||
|
1. Check original article
|
||||||
|
2. Manually add missing fields
|
||||||
|
3. Use alternative source (PubMed vs CrossRef)
|
||||||
|
|
||||||
|
### Issue 4: Cannot Find Duplicate
|
||||||
|
|
||||||
|
**Problem**: Same paper appears twice, not detected.
|
||||||
|
|
||||||
|
**Cause**:
|
||||||
|
- Different DOIs (should be rare)
|
||||||
|
- Different titles (abbreviated, typo)
|
||||||
|
- Different citation keys
|
||||||
|
|
||||||
|
**Solution**:
|
||||||
|
- Manual search for author + year
|
||||||
|
- Check for similar titles
|
||||||
|
- Remove manually
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Validation ensures citation quality:
|
||||||
|
|
||||||
|
✓ **Accuracy**: DOIs resolve, metadata correct
|
||||||
|
✓ **Completeness**: All required fields present
|
||||||
|
✓ **Consistency**: Proper formatting throughout
|
||||||
|
✓ **No duplicates**: Each paper cited once
|
||||||
|
✓ **Valid syntax**: BibTeX compiles without errors
|
||||||
|
|
||||||
|
**Always validate** before final submission!
|
||||||
|
|
||||||
|
Use automated tools:
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
Follow workflow:
|
||||||
|
1. Extract metadata
|
||||||
|
2. Validate
|
||||||
|
3. Fix errors
|
||||||
|
4. Re-validate
|
||||||
|
5. Submit
|
||||||
|
|
||||||
725
references/google_scholar_search.md
Normal file
725
references/google_scholar_search.md
Normal file
@@ -0,0 +1,725 @@
|
|||||||
|
# Google Scholar Search Guide
|
||||||
|
|
||||||
|
Comprehensive guide to searching Google Scholar for academic papers, including advanced search operators, filtering strategies, and metadata extraction.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Google Scholar provides the most comprehensive coverage of academic literature across all disciplines:
|
||||||
|
- **Coverage**: 100+ million scholarly documents
|
||||||
|
- **Scope**: All academic disciplines
|
||||||
|
- **Content types**: Journal articles, books, theses, conference papers, preprints, patents, court opinions
|
||||||
|
- **Citation tracking**: "Cited by" links for forward citation tracking
|
||||||
|
- **Accessibility**: Free to use, no account required
|
||||||
|
|
||||||
|
## Basic Search
|
||||||
|
|
||||||
|
### Simple Keyword Search
|
||||||
|
|
||||||
|
Search for papers containing specific terms anywhere in the document (title, abstract, full text):
|
||||||
|
|
||||||
|
```
|
||||||
|
CRISPR gene editing
|
||||||
|
machine learning protein folding
|
||||||
|
climate change impact agriculture
|
||||||
|
quantum computing algorithms
|
||||||
|
```
|
||||||
|
|
||||||
|
**Tips**:
|
||||||
|
- Use specific technical terms
|
||||||
|
- Include key acronyms and abbreviations
|
||||||
|
- Start broad, then refine
|
||||||
|
- Check spelling of technical terms
|
||||||
|
|
||||||
|
### Exact Phrase Search
|
||||||
|
|
||||||
|
Use quotation marks to search for exact phrases:
|
||||||
|
|
||||||
|
```
|
||||||
|
"deep learning"
|
||||||
|
"CRISPR-Cas9"
|
||||||
|
"systematic review"
|
||||||
|
"randomized controlled trial"
|
||||||
|
```
|
||||||
|
|
||||||
|
**When to use**:
|
||||||
|
- Technical terms that must appear together
|
||||||
|
- Proper names
|
||||||
|
- Specific methodologies
|
||||||
|
- Exact titles
|
||||||
|
|
||||||
|
## Advanced Search Operators
|
||||||
|
|
||||||
|
### Author Search
|
||||||
|
|
||||||
|
Find papers by specific authors:
|
||||||
|
|
||||||
|
```
|
||||||
|
author:LeCun
|
||||||
|
author:"Geoffrey Hinton"
|
||||||
|
author:Church synthetic biology
|
||||||
|
```
|
||||||
|
|
||||||
|
**Variations**:
|
||||||
|
- Single last name: `author:Smith`
|
||||||
|
- Full name in quotes: `author:"Jane Smith"`
|
||||||
|
- Author + topic: `author:Doudna CRISPR`
|
||||||
|
|
||||||
|
**Tips**:
|
||||||
|
- Authors may publish under different name variations
|
||||||
|
- Try with and without middle initials
|
||||||
|
- Consider name changes (marriage, etc.)
|
||||||
|
- Use quotation marks for full names
|
||||||
|
|
||||||
|
### Title Search
|
||||||
|
|
||||||
|
Search only in article titles:
|
||||||
|
|
||||||
|
```
|
||||||
|
intitle:transformer
|
||||||
|
intitle:"attention mechanism"
|
||||||
|
intitle:review climate change
|
||||||
|
```
|
||||||
|
|
||||||
|
**Use cases**:
|
||||||
|
- Finding papers specifically about a topic
|
||||||
|
- More precise than full-text search
|
||||||
|
- Reduces irrelevant results
|
||||||
|
- Good for finding reviews or methods
|
||||||
|
|
||||||
|
### Source (Journal) Search
|
||||||
|
|
||||||
|
Search within specific journals or conferences:
|
||||||
|
|
||||||
|
```
|
||||||
|
source:Nature
|
||||||
|
source:"Nature Communications"
|
||||||
|
source:NeurIPS
|
||||||
|
source:"Journal of Machine Learning Research"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Applications**:
|
||||||
|
- Track publications in top-tier venues
|
||||||
|
- Find papers in specialized journals
|
||||||
|
- Identify conference-specific work
|
||||||
|
- Verify publication venue
|
||||||
|
|
||||||
|
### Exclusion Operator
|
||||||
|
|
||||||
|
Exclude terms from results:
|
||||||
|
|
||||||
|
```
|
||||||
|
machine learning -survey
|
||||||
|
CRISPR -patent
|
||||||
|
climate change -news
|
||||||
|
deep learning -tutorial -review
|
||||||
|
```
|
||||||
|
|
||||||
|
**Common exclusions**:
|
||||||
|
- `-survey`: Exclude survey papers
|
||||||
|
- `-review`: Exclude review articles
|
||||||
|
- `-patent`: Exclude patents
|
||||||
|
- `-book`: Exclude books
|
||||||
|
- `-news`: Exclude news articles
|
||||||
|
- `-tutorial`: Exclude tutorials
|
||||||
|
|
||||||
|
### OR Operator
|
||||||
|
|
||||||
|
Search for papers containing any of multiple terms:
|
||||||
|
|
||||||
|
```
|
||||||
|
"machine learning" OR "deep learning"
|
||||||
|
CRISPR OR "gene editing"
|
||||||
|
"climate change" OR "global warming"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Best practices**:
|
||||||
|
- OR must be uppercase
|
||||||
|
- Combine synonyms
|
||||||
|
- Include acronyms and spelled-out versions
|
||||||
|
- Use with exact phrases
|
||||||
|
|
||||||
|
### Wildcard Search
|
||||||
|
|
||||||
|
Use asterisk (*) as wildcard for unknown words:
|
||||||
|
|
||||||
|
```
|
||||||
|
"machine * learning"
|
||||||
|
"CRISPR * editing"
|
||||||
|
"* neural network"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note**: Limited wildcard support in Google Scholar compared to other databases.
|
||||||
|
|
||||||
|
## Advanced Filtering
|
||||||
|
|
||||||
|
### Year Range
|
||||||
|
|
||||||
|
Filter by publication year:
|
||||||
|
|
||||||
|
**Using interface**:
|
||||||
|
- Click "Since [year]" on left sidebar
|
||||||
|
- Select custom range
|
||||||
|
|
||||||
|
**Using search operators**:
|
||||||
|
```
|
||||||
|
# Inline numeric range works in the query, e.g.: quantum computing 2020..2024
|
||||||
|
# Or use the sidebar interface / URL parameters (as_ylo, as_yhi)
|
||||||
|
```
|
||||||
|
|
||||||
|
**In script**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_google_scholar.py "quantum computing" \
|
||||||
|
--year-start 2020 \
|
||||||
|
--year-end 2024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sorting Options
|
||||||
|
|
||||||
|
**By relevance** (default):
|
||||||
|
- Google's algorithm determines relevance
|
||||||
|
- Considers citations, author reputation, publication venue
|
||||||
|
- Generally good for most searches
|
||||||
|
|
||||||
|
**By date**:
|
||||||
|
- Most recent papers first
|
||||||
|
- Good for fast-moving fields
|
||||||
|
- May miss highly cited older papers
|
||||||
|
- Click "Sort by date" in interface
|
||||||
|
|
||||||
|
**By citation count** (via script):
|
||||||
|
```bash
|
||||||
|
python scripts/search_google_scholar.py "transformers" \
|
||||||
|
--sort-by citations \
|
||||||
|
--limit 50
|
||||||
|
```
|
||||||
|
|
||||||
|
### Language Filtering
|
||||||
|
|
||||||
|
**In interface**:
|
||||||
|
- Settings → Languages
|
||||||
|
- Select preferred languages
|
||||||
|
|
||||||
|
**Default**: English and papers with English abstracts
|
||||||
|
|
||||||
|
## Search Strategies
|
||||||
|
|
||||||
|
### Finding Seminal Papers
|
||||||
|
|
||||||
|
Identify highly influential papers in a field:
|
||||||
|
|
||||||
|
1. **Search by topic** with broad terms
|
||||||
|
2. **Sort by citations** (most cited first)
|
||||||
|
3. **Look for review articles** for comprehensive overviews
|
||||||
|
4. **Check publication dates** for foundational vs recent work
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```
|
||||||
|
"generative adversarial networks"
|
||||||
|
# Sort by citations
|
||||||
|
# Top results: original GAN paper (Goodfellow et al., 2014), key variants
|
||||||
|
```
|
||||||
|
|
||||||
|
### Finding Recent Work
|
||||||
|
|
||||||
|
Stay current with latest research:
|
||||||
|
|
||||||
|
1. **Search by topic**
|
||||||
|
2. **Filter to recent years** (last 1-2 years)
|
||||||
|
3. **Sort by date** for newest first
|
||||||
|
4. **Set up alerts** for ongoing tracking
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_google_scholar.py "AlphaFold protein structure" \
|
||||||
|
--year-start 2023 \
|
||||||
|
--year-end 2024 \
|
||||||
|
--limit 50
|
||||||
|
```
|
||||||
|
|
||||||
|
### Finding Review Articles
|
||||||
|
|
||||||
|
Get comprehensive overviews of a field:
|
||||||
|
|
||||||
|
```
|
||||||
|
intitle:review "machine learning"
|
||||||
|
"systematic review" CRISPR
|
||||||
|
intitle:survey "natural language processing"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Indicators**:
|
||||||
|
- "review", "survey", "perspective" in title
|
||||||
|
- Often highly cited
|
||||||
|
- Published in review journals (Nature Reviews, Trends, etc.)
|
||||||
|
- Comprehensive reference lists
|
||||||
|
|
||||||
|
### Citation Chain Search
|
||||||
|
|
||||||
|
**Forward citations** (papers citing a key paper):
|
||||||
|
1. Find seminal paper
|
||||||
|
2. Click "Cited by X"
|
||||||
|
3. See all papers that cite it
|
||||||
|
4. Identify how field has developed
|
||||||
|
|
||||||
|
**Backward citations** (references in a key paper):
|
||||||
|
1. Find recent review or important paper
|
||||||
|
2. Check its reference list
|
||||||
|
3. Identify foundational work
|
||||||
|
4. Trace development of ideas
|
||||||
|
|
||||||
|
**Example workflow**:
|
||||||
|
```
|
||||||
|
# Find original transformer paper
|
||||||
|
"Attention is all you need" author:Vaswani
|
||||||
|
|
||||||
|
# Check "Cited by 120,000+"
|
||||||
|
# See evolution: BERT, GPT, T5, etc.
|
||||||
|
|
||||||
|
# Check references in original paper
|
||||||
|
# Find RNN, LSTM, attention mechanism origins
|
||||||
|
```
|
||||||
|
|
||||||
|
### Comprehensive Literature Search
|
||||||
|
|
||||||
|
For thorough coverage (e.g., systematic reviews):
|
||||||
|
|
||||||
|
1. **Generate synonym list**:
|
||||||
|
- Main terms + alternatives
|
||||||
|
- Acronyms + spelled out
|
||||||
|
- US vs UK spelling
|
||||||
|
|
||||||
|
2. **Use OR operators**:
|
||||||
|
```
|
||||||
|
("machine learning" OR "deep learning" OR "neural networks")
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Combine multiple concepts**:
|
||||||
|
```
|
||||||
|
("machine learning" OR "deep learning") ("drug discovery" OR "drug development")
|
||||||
|
```
|
||||||
|
|
||||||
|
4. **Search without date filters** initially:
|
||||||
|
- Get total landscape
|
||||||
|
- Filter later if too many results
|
||||||
|
|
||||||
|
5. **Export results** for systematic analysis:
|
||||||
|
```bash
|
||||||
|
python scripts/search_google_scholar.py \
|
||||||
|
'"machine learning" OR "deep learning" drug discovery' \
|
||||||
|
--limit 500 \
|
||||||
|
--output comprehensive_search.json
|
||||||
|
```
|
||||||
|
|
||||||
|
## Extracting Citation Information
|
||||||
|
|
||||||
|
### From Google Scholar Results Page
|
||||||
|
|
||||||
|
Each result shows:
|
||||||
|
- **Title**: Paper title (linked to full text if available)
|
||||||
|
- **Authors**: Author list (often truncated)
|
||||||
|
- **Source**: Journal/conference, year, publisher
|
||||||
|
- **Cited by**: Number of citations + link to citing papers
|
||||||
|
- **Related articles**: Link to similar papers
|
||||||
|
- **All versions**: Different versions of the same paper
|
||||||
|
|
||||||
|
### Export Options
|
||||||
|
|
||||||
|
**Manual export**:
|
||||||
|
1. Click "Cite" under paper
|
||||||
|
2. Select BibTeX format
|
||||||
|
3. Copy citation
|
||||||
|
|
||||||
|
**Limitations**:
|
||||||
|
- One paper at a time
|
||||||
|
- Manual process
|
||||||
|
- Time-consuming for many papers
|
||||||
|
|
||||||
|
**Automated export** (using script):
|
||||||
|
```bash
|
||||||
|
# Search and export to BibTeX
|
||||||
|
python scripts/search_google_scholar.py "quantum computing" \
|
||||||
|
--limit 50 \
|
||||||
|
--format bibtex \
|
||||||
|
--output quantum_papers.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
### Metadata Available
|
||||||
|
|
||||||
|
From Google Scholar you can typically extract:
|
||||||
|
- Title
|
||||||
|
- Authors (may be incomplete)
|
||||||
|
- Year
|
||||||
|
- Source (journal/conference)
|
||||||
|
- Citation count
|
||||||
|
- Link to full text (when available)
|
||||||
|
- Link to PDF (when available)
|
||||||
|
|
||||||
|
**Note**: Metadata quality varies:
|
||||||
|
- Some fields may be missing
|
||||||
|
- Author names may be incomplete
|
||||||
|
- Need to verify with DOI lookup for accuracy
|
||||||
|
|
||||||
|
## Rate Limiting and Access
|
||||||
|
|
||||||
|
### Rate Limits
|
||||||
|
|
||||||
|
Google Scholar has rate limiting to prevent automated scraping:
|
||||||
|
|
||||||
|
**Symptoms of rate limiting**:
|
||||||
|
- CAPTCHA challenges
|
||||||
|
- Temporary IP blocks
|
||||||
|
- 429 "Too Many Requests" errors
|
||||||
|
|
||||||
|
**Best practices**:
|
||||||
|
1. **Add delays between requests**: 2-5 seconds minimum
|
||||||
|
2. **Limit query volume**: Don't search hundreds of queries rapidly
|
||||||
|
3. **Use scholarly library**: Handles rate limiting automatically
|
||||||
|
4. **Rotate User-Agents**: Appear as different browsers
|
||||||
|
5. **Consider proxies**: For large-scale searches (use ethically)
|
||||||
|
|
||||||
|
**In our scripts**:
|
||||||
|
```python
|
||||||
|
# Automatic rate limiting built in
|
||||||
|
time.sleep(random.uniform(3, 7)) # Random delay 3-7 seconds
|
||||||
|
```
|
||||||
|
|
||||||
|
### Ethical Considerations
|
||||||
|
|
||||||
|
**DO**:
|
||||||
|
- Respect rate limits
|
||||||
|
- Use reasonable delays
|
||||||
|
- Cache results (don't re-query)
|
||||||
|
- Use official APIs when available
|
||||||
|
- Attribute data properly
|
||||||
|
|
||||||
|
**DON'T**:
|
||||||
|
- Scrape aggressively
|
||||||
|
- Use multiple IPs to bypass limits
|
||||||
|
- Violate terms of service
|
||||||
|
- Burden servers unnecessarily
|
||||||
|
- Use data commercially without permission
|
||||||
|
|
||||||
|
### Institutional Access
|
||||||
|
|
||||||
|
**Benefits of institutional access**:
|
||||||
|
- Access to full-text PDFs through library subscriptions
|
||||||
|
- Better download capabilities
|
||||||
|
- Integration with library systems
|
||||||
|
- Link resolver to full text
|
||||||
|
|
||||||
|
**Setup**:
|
||||||
|
- Google Scholar → Settings → Library links
|
||||||
|
- Add your institution
|
||||||
|
- Links appear in search results
|
||||||
|
|
||||||
|
## Tips and Best Practices
|
||||||
|
|
||||||
|
### Search Optimization
|
||||||
|
|
||||||
|
1. **Start simple, then refine**:
|
||||||
|
```
|
||||||
|
# Too specific initially
|
||||||
|
intitle:"deep learning" intitle:review source:Nature 2023..2024
|
||||||
|
|
||||||
|
# Better approach
|
||||||
|
deep learning review
|
||||||
|
# Review results
|
||||||
|
# Add intitle:, source:, year filters as needed
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Use multiple search strategies**:
|
||||||
|
- Keyword search
|
||||||
|
- Author search for known experts
|
||||||
|
- Citation chaining from key papers
|
||||||
|
- Source search in top journals
|
||||||
|
|
||||||
|
3. **Check spelling and variations**:
|
||||||
|
- Color vs colour
|
||||||
|
- Optimization vs optimisation
|
||||||
|
- Tumor vs tumour
|
||||||
|
- Try common misspellings if few results
|
||||||
|
|
||||||
|
4. **Combine operators strategically**:
|
||||||
|
```
|
||||||
|
# Good combination
|
||||||
|
author:Church intitle:"synthetic biology" 2015..2024
|
||||||
|
|
||||||
|
# Find reviews by specific author on topic in recent years
|
||||||
|
```
|
||||||
|
|
||||||
|
### Result Evaluation
|
||||||
|
|
||||||
|
1. **Check citation counts**:
|
||||||
|
- High citations indicate influence
|
||||||
|
- Recent papers may have low citations but be important
|
||||||
|
- Citation counts vary by field
|
||||||
|
|
||||||
|
2. **Verify publication venue**:
|
||||||
|
- Peer-reviewed journals vs preprints
|
||||||
|
- Conference proceedings
|
||||||
|
- Book chapters
|
||||||
|
- Technical reports
|
||||||
|
|
||||||
|
3. **Check for full text access**:
|
||||||
|
- [PDF] link on right side
|
||||||
|
- "All X versions" may have open access version
|
||||||
|
- Check institutional access
|
||||||
|
- Try author's website or ResearchGate
|
||||||
|
|
||||||
|
4. **Look for review articles**:
|
||||||
|
- Comprehensive overviews
|
||||||
|
- Good starting point for new topics
|
||||||
|
- Extensive reference lists
|
||||||
|
|
||||||
|
### Managing Results
|
||||||
|
|
||||||
|
1. **Use citation manager integration**:
|
||||||
|
- Export to BibTeX
|
||||||
|
- Import to Zotero, Mendeley, EndNote
|
||||||
|
- Maintain organized library
|
||||||
|
|
||||||
|
2. **Set up alerts** for ongoing research:
|
||||||
|
- Google Scholar → Alerts
|
||||||
|
- Get emails for new papers matching query
|
||||||
|
- Track specific authors or topics
|
||||||
|
|
||||||
|
3. **Create collections**:
|
||||||
|
- Save papers to Google Scholar Library
|
||||||
|
- Organize by project or topic
|
||||||
|
- Add labels and notes
|
||||||
|
|
||||||
|
4. **Export systematically**:
|
||||||
|
```bash
|
||||||
|
# Save search results for later analysis
|
||||||
|
python scripts/search_google_scholar.py "your topic" \
|
||||||
|
--output topic_papers.json
|
||||||
|
|
||||||
|
# Can re-process later without re-searching
|
||||||
|
python scripts/extract_metadata.py \
|
||||||
|
--input topic_papers.json \
|
||||||
|
--output topic_refs.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
## Advanced Techniques
|
||||||
|
|
||||||
|
### Boolean Logic Combinations
|
||||||
|
|
||||||
|
Combine multiple operators for precise searches:
|
||||||
|
|
||||||
|
```
|
||||||
|
# Highly cited reviews on specific topic by known authors
|
||||||
|
intitle:review "machine learning" ("drug discovery" OR "drug development")
|
||||||
|
author:Horvath OR author:Bengio 2020..2024
|
||||||
|
|
||||||
|
# Method papers excluding reviews
|
||||||
|
intitle:method "protein folding" -review -survey
|
||||||
|
|
||||||
|
# Papers in top journals only
|
||||||
|
("Nature" OR "Science" OR "Cell") CRISPR 2022..2024
|
||||||
|
```
|
||||||
|
|
||||||
|
### Finding Open Access Papers
|
||||||
|
|
||||||
|
```
|
||||||
|
# Search with generic terms
|
||||||
|
machine learning
|
||||||
|
|
||||||
|
# Filter by "All versions" which often includes preprints
|
||||||
|
# Look for green [PDF] links (often open access)
|
||||||
|
# Check arXiv, bioRxiv versions
|
||||||
|
```
|
||||||
|
|
||||||
|
**In script**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_google_scholar.py "topic" \
|
||||||
|
--open-access-only \
|
||||||
|
--output open_access_papers.json
|
||||||
|
```
|
||||||
|
|
||||||
|
### Tracking Research Impact
|
||||||
|
|
||||||
|
**For a specific paper**:
|
||||||
|
1. Find the paper
|
||||||
|
2. Click "Cited by X"
|
||||||
|
3. Analyze citing papers:
|
||||||
|
- How is it being used?
|
||||||
|
- What fields cite it?
|
||||||
|
- Recent vs older citations?
|
||||||
|
|
||||||
|
**For an author**:
|
||||||
|
1. Search `author:LastName`
|
||||||
|
2. Check h-index and i10-index
|
||||||
|
3. View citation history graph
|
||||||
|
4. Identify most influential papers
|
||||||
|
|
||||||
|
**For a topic**:
|
||||||
|
1. Search topic
|
||||||
|
2. Sort by citations
|
||||||
|
3. Identify seminal papers (highly cited, older)
|
||||||
|
4. Check recent highly-cited papers (emerging important work)
|
||||||
|
|
||||||
|
### Finding Preprints and Early Work
|
||||||
|
|
||||||
|
```
|
||||||
|
# arXiv papers
|
||||||
|
source:arxiv "deep learning"
|
||||||
|
|
||||||
|
# bioRxiv papers
|
||||||
|
source:biorxiv CRISPR
|
||||||
|
|
||||||
|
# All preprint servers
|
||||||
|
("arxiv" OR "biorxiv" OR "medrxiv") your topic
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note**: Preprints are not peer-reviewed. Always check if published version exists.
|
||||||
|
|
||||||
|
## Common Issues and Solutions
|
||||||
|
|
||||||
|
### Too Many Results
|
||||||
|
|
||||||
|
**Problem**: Search returns 100,000+ results, overwhelming.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
1. Add more specific terms
|
||||||
|
2. Use `intitle:` to search only titles
|
||||||
|
3. Filter by recent years
|
||||||
|
4. Add exclusions (e.g., `-review`)
|
||||||
|
5. Search within specific journals
|
||||||
|
|
||||||
|
### Too Few Results
|
||||||
|
|
||||||
|
**Problem**: Search returns 0-10 results, suspiciously few.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
1. Remove restrictive operators
|
||||||
|
2. Try synonyms and related terms
|
||||||
|
3. Check spelling
|
||||||
|
4. Broaden year range
|
||||||
|
5. Use OR for alternative terms
|
||||||
|
|
||||||
|
### Irrelevant Results
|
||||||
|
|
||||||
|
**Problem**: Results don't match intent.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
1. Use exact phrases with quotes
|
||||||
|
2. Add more specific context terms
|
||||||
|
3. Use `intitle:` for title-only search
|
||||||
|
4. Exclude common irrelevant terms
|
||||||
|
5. Combine multiple specific terms
|
||||||
|
|
||||||
|
### CAPTCHA or Rate Limiting
|
||||||
|
|
||||||
|
**Problem**: Google Scholar shows CAPTCHA or blocks access.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
1. Wait several minutes before continuing
|
||||||
|
2. Reduce query frequency
|
||||||
|
3. Use longer delays in scripts (5-10 seconds)
|
||||||
|
4. Switch to different IP/network
|
||||||
|
5. Consider using institutional access
|
||||||
|
|
||||||
|
### Missing Metadata
|
||||||
|
|
||||||
|
**Problem**: Author names, year, or venue missing from results.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
1. Click through to see full details
|
||||||
|
2. Check "All versions" for better metadata
|
||||||
|
3. Look up by DOI if available
|
||||||
|
4. Extract metadata from CrossRef/PubMed instead
|
||||||
|
5. Manually verify from paper PDF
|
||||||
|
|
||||||
|
### Duplicate Results
|
||||||
|
|
||||||
|
**Problem**: Same paper appears multiple times.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
1. Click "All X versions" to see consolidated view
|
||||||
|
2. Choose version with best metadata
|
||||||
|
3. Use deduplication in post-processing:
|
||||||
|
```bash
|
||||||
|
python scripts/format_bibtex.py results.bib \
|
||||||
|
--deduplicate \
|
||||||
|
--output clean_results.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
## Integration with Scripts
|
||||||
|
|
||||||
|
### search_google_scholar.py Usage
|
||||||
|
|
||||||
|
**Basic search**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_google_scholar.py "machine learning drug discovery"
|
||||||
|
```
|
||||||
|
|
||||||
|
**With year filter**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_google_scholar.py "CRISPR" \
|
||||||
|
--year-start 2020 \
|
||||||
|
--year-end 2024 \
|
||||||
|
--limit 100
|
||||||
|
```
|
||||||
|
|
||||||
|
**Sort by citations**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_google_scholar.py "transformers" \
|
||||||
|
--sort-by citations \
|
||||||
|
--limit 50
|
||||||
|
```
|
||||||
|
|
||||||
|
**Export to BibTeX**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_google_scholar.py "quantum computing" \
|
||||||
|
--format bibtex \
|
||||||
|
--output quantum.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
**Export to JSON for later processing**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_google_scholar.py "topic" \
|
||||||
|
--format json \
|
||||||
|
--output results.json
|
||||||
|
|
||||||
|
# Later: extract full metadata
|
||||||
|
python scripts/extract_metadata.py \
|
||||||
|
--input results.json \
|
||||||
|
--output references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
### Batch Searching
|
||||||
|
|
||||||
|
For multiple topics:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create file with search queries (queries.txt)
|
||||||
|
# One query per line
|
||||||
|
|
||||||
|
# Search each query
|
||||||
|
while read query; do
|
||||||
|
python scripts/search_google_scholar.py "$query" \
|
||||||
|
--limit 50 \
|
||||||
|
--output "${query// /_}.json"
|
||||||
|
sleep 10 # Delay between queries
|
||||||
|
done < queries.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Google Scholar is the most comprehensive academic search engine, providing:
|
||||||
|
|
||||||
|
✓ **Broad coverage**: All disciplines, 100M+ documents
|
||||||
|
✓ **Free access**: No account or subscription required
|
||||||
|
✓ **Citation tracking**: "Cited by" for impact analysis
|
||||||
|
✓ **Multiple formats**: Articles, books, theses, patents
|
||||||
|
✓ **Full-text search**: Not just abstracts
|
||||||
|
|
||||||
|
Key strategies:
|
||||||
|
- Use advanced operators for precision
|
||||||
|
- Combine author, title, source searches
|
||||||
|
- Track citations for impact
|
||||||
|
- Export systematically to citation manager
|
||||||
|
- Respect rate limits and access policies
|
||||||
|
- Verify metadata with CrossRef/PubMed
|
||||||
|
|
||||||
|
For biomedical research, complement with PubMed for MeSH terms and curated metadata.
|
||||||
|
|
||||||
870
references/metadata_extraction.md
Normal file
870
references/metadata_extraction.md
Normal file
@@ -0,0 +1,870 @@
|
|||||||
|
# Metadata Extraction Guide
|
||||||
|
|
||||||
|
Comprehensive guide to extracting accurate citation metadata from DOIs, PMIDs, arXiv IDs, and URLs using various APIs and services.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Accurate metadata is essential for proper citations. This guide covers:
|
||||||
|
- Identifying paper identifiers (DOI, PMID, arXiv ID)
|
||||||
|
- Querying metadata APIs (CrossRef, PubMed, arXiv, DataCite)
|
||||||
|
- Required BibTeX fields by entry type
|
||||||
|
- Handling edge cases and special situations
|
||||||
|
- Validating extracted metadata
|
||||||
|
|
||||||
|
## Paper Identifiers
|
||||||
|
|
||||||
|
### DOI (Digital Object Identifier)
|
||||||
|
|
||||||
|
**Format**: `10.XXXX/suffix`
|
||||||
|
|
||||||
|
**Examples**:
|
||||||
|
```
|
||||||
|
10.1038/s41586-021-03819-2 # Nature article
|
||||||
|
10.1126/science.aam9317 # Science article
|
||||||
|
10.1016/j.cell.2023.01.001 # Cell article
|
||||||
|
10.1371/journal.pone.0123456 # PLOS ONE article
|
||||||
|
```
|
||||||
|
|
||||||
|
**Properties**:
|
||||||
|
- Permanent identifier
|
||||||
|
- Most reliable for metadata
|
||||||
|
- Resolves to current location
|
||||||
|
- Publisher-assigned
|
||||||
|
|
||||||
|
**Where to find**:
|
||||||
|
- First page of article
|
||||||
|
- Article webpage
|
||||||
|
- CrossRef, Google Scholar, PubMed
|
||||||
|
- Usually prominent on publisher site
|
||||||
|
|
||||||
|
### PMID (PubMed ID)
|
||||||
|
|
||||||
|
**Format**: 1- to 8-digit number (recent records are typically 8 digits)
|
||||||
|
|
||||||
|
**Examples**:
|
||||||
|
```
|
||||||
|
34265844
|
||||||
|
28445112
|
||||||
|
35476778
|
||||||
|
```
|
||||||
|
|
||||||
|
**Properties**:
|
||||||
|
- Specific to PubMed database
|
||||||
|
- Biomedical literature only
|
||||||
|
- Assigned by NCBI
|
||||||
|
- Permanent identifier
|
||||||
|
|
||||||
|
**Where to find**:
|
||||||
|
- PubMed search results
|
||||||
|
- Article page on PubMed
|
||||||
|
- Often in article PDF footer
|
||||||
|
- PMC (PubMed Central) pages
|
||||||
|
|
||||||
|
### PMCID (PubMed Central ID)
|
||||||
|
|
||||||
|
**Format**: PMC followed by numbers
|
||||||
|
|
||||||
|
**Examples**:
|
||||||
|
```
|
||||||
|
PMC8287551
|
||||||
|
PMC7456789
|
||||||
|
```
|
||||||
|
|
||||||
|
**Properties**:
|
||||||
|
- Free full-text articles in PMC
|
||||||
|
- Subset of PubMed articles
|
||||||
|
- Open access or author manuscripts
|
||||||
|
|
||||||
|
### arXiv ID
|
||||||
|
|
||||||
|
**Format**: YYMM.NNNNN or archive/YYMMNNN
|
||||||
|
|
||||||
|
**Examples**:
|
||||||
|
```
|
||||||
|
2103.14030 # New format (since 2007)
|
||||||
|
2401.12345 # 2024 submission
|
||||||
|
arXiv:hep-th/9901001 # Old format
|
||||||
|
```
|
||||||
|
|
||||||
|
**Properties**:
|
||||||
|
- Preprints (not peer-reviewed)
|
||||||
|
- Physics, math, CS, q-bio, etc.
|
||||||
|
- Version tracking (v1, v2, etc.)
|
||||||
|
- Free, open access
|
||||||
|
|
||||||
|
**Where to find**:
|
||||||
|
- arXiv.org
|
||||||
|
- Often cited before publication
|
||||||
|
- Paper PDF header
|
||||||
|
|
||||||
|
### Other Identifiers
|
||||||
|
|
||||||
|
**ISBN** (Books):
|
||||||
|
```
|
||||||
|
978-0-12-345678-9
|
||||||
|
0-123-45678-9
|
||||||
|
```
|
||||||
|
|
||||||
|
**arXiv category**:
|
||||||
|
```
|
||||||
|
cs.LG # Computer Science - Machine Learning
|
||||||
|
q-bio.QM # Quantitative Biology - Quantitative Methods
|
||||||
|
math.ST # Mathematics - Statistics
|
||||||
|
```
|
||||||
|
|
||||||
|
## Metadata APIs
|
||||||
|
|
||||||
|
### CrossRef API
|
||||||
|
|
||||||
|
**Primary source for DOIs** - Most comprehensive metadata for journal articles.
|
||||||
|
|
||||||
|
**Base URL**: `https://api.crossref.org/works/`
|
||||||
|
|
||||||
|
**No API key required**, but polite pool recommended:
|
||||||
|
- Add email to User-Agent
|
||||||
|
- Gets better service
|
||||||
|
- No rate limits
|
||||||
|
|
||||||
|
#### Basic DOI Lookup
|
||||||
|
|
||||||
|
**Request**:
|
||||||
|
```
|
||||||
|
GET https://api.crossref.org/works/10.1038/s41586-021-03819-2
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response** (simplified):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"message": {
|
||||||
|
"DOI": "10.1038/s41586-021-03819-2",
|
||||||
|
"title": ["Article title here"],
|
||||||
|
"author": [
|
||||||
|
{"given": "John", "family": "Smith"},
|
||||||
|
{"given": "Jane", "family": "Doe"}
|
||||||
|
],
|
||||||
|
"container-title": ["Nature"],
|
||||||
|
"volume": "596",
|
||||||
|
"issue": "7873",
|
||||||
|
"page": "583-589",
|
||||||
|
"published-print": {"date-parts": [[2021, 7, 1]]},
|
||||||
|
"publisher": "Springer Nature",
|
||||||
|
"type": "journal-article",
|
||||||
|
"ISSN": ["0028-0836"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Fields Available
|
||||||
|
|
||||||
|
**Always present**:
|
||||||
|
- `DOI`: Digital Object Identifier
|
||||||
|
- `title`: Article title (array)
|
||||||
|
- `type`: Content type (journal-article, book-chapter, etc.)
|
||||||
|
|
||||||
|
**Usually present**:
|
||||||
|
- `author`: Array of author objects
|
||||||
|
- `container-title`: Journal/book title
|
||||||
|
- `published-print` or `published-online`: Publication date
|
||||||
|
- `volume`, `issue`, `page`: Publication details
|
||||||
|
- `publisher`: Publisher name
|
||||||
|
|
||||||
|
**Sometimes present**:
|
||||||
|
- `abstract`: Article abstract
|
||||||
|
- `subject`: Subject categories
|
||||||
|
- `ISSN`: Journal ISSN
|
||||||
|
- `ISBN`: Book ISBN
|
||||||
|
- `reference`: Reference list
|
||||||
|
- `is-referenced-by-count`: Citation count
|
||||||
|
|
||||||
|
#### Content Types
|
||||||
|
|
||||||
|
CrossRef `type` field values:
|
||||||
|
- `journal-article`: Journal articles
|
||||||
|
- `book-chapter`: Book chapters
|
||||||
|
- `book`: Books
|
||||||
|
- `proceedings-article`: Conference papers
|
||||||
|
- `posted-content`: Preprints
|
||||||
|
- `dataset`: Research datasets
|
||||||
|
- `report`: Technical reports
|
||||||
|
- `dissertation`: Theses/dissertations
|
||||||
|
|
||||||
|
### PubMed E-utilities API
|
||||||
|
|
||||||
|
**Specialized for biomedical literature** - Curated metadata with MeSH terms.
|
||||||
|
|
||||||
|
**Base URL**: `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/`
|
||||||
|
|
||||||
|
**API key recommended** (free):
|
||||||
|
- Higher rate limits
|
||||||
|
- Better performance
|
||||||
|
|
||||||
|
#### PMID to Metadata
|
||||||
|
|
||||||
|
**Step 1: EFetch for full record**
|
||||||
|
|
||||||
|
```
|
||||||
|
GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?
|
||||||
|
db=pubmed&
|
||||||
|
id=34265844&
|
||||||
|
retmode=xml&
|
||||||
|
api_key=YOUR_KEY
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response**: XML with comprehensive metadata
|
||||||
|
|
||||||
|
**Step 2: Parse XML**
|
||||||
|
|
||||||
|
Key fields:
|
||||||
|
```xml
|
||||||
|
<PubmedArticle>
|
||||||
|
<MedlineCitation>
|
||||||
|
<PMID>34265844</PMID>
|
||||||
|
<Article>
|
||||||
|
<ArticleTitle>Title here</ArticleTitle>
|
||||||
|
<AuthorList>
|
||||||
|
<Author><LastName>Smith</LastName><ForeName>John</ForeName></Author>
|
||||||
|
</AuthorList>
|
||||||
|
<Journal>
|
||||||
|
<Title>Nature</Title>
|
||||||
|
<JournalIssue>
|
||||||
|
<Volume>596</Volume>
|
||||||
|
<Issue>7873</Issue>
|
||||||
|
<PubDate><Year>2021</Year></PubDate>
|
||||||
|
</JournalIssue>
|
||||||
|
</Journal>
|
||||||
|
<Pagination><MedlinePgn>583-589</MedlinePgn></Pagination>
|
||||||
|
<Abstract><AbstractText>Abstract text here</AbstractText></Abstract>
|
||||||
|
</Article>
|
||||||
|
</MedlineCitation>
|
||||||
|
<PubmedData>
|
||||||
|
<ArticleIdList>
|
||||||
|
<ArticleId IdType="doi">10.1038/s41586-021-03819-2</ArticleId>
|
||||||
|
<ArticleId IdType="pmc">PMC8287551</ArticleId>
|
||||||
|
</ArticleIdList>
|
||||||
|
</PubmedData>
|
||||||
|
</PubmedArticle>
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Unique PubMed Fields
|
||||||
|
|
||||||
|
**MeSH Terms**: Controlled vocabulary
|
||||||
|
```xml
|
||||||
|
<MeshHeadingList>
|
||||||
|
<MeshHeading>
|
||||||
|
<DescriptorName UI="D003920">Diabetes Mellitus</DescriptorName>
|
||||||
|
</MeshHeading>
|
||||||
|
</MeshHeadingList>
|
||||||
|
```
|
||||||
|
|
||||||
|
**Publication Types**:
|
||||||
|
```xml
|
||||||
|
<PublicationTypeList>
|
||||||
|
<PublicationType UI="D016428">Journal Article</PublicationType>
|
||||||
|
<PublicationType UI="D016449">Randomized Controlled Trial</PublicationType>
|
||||||
|
</PublicationTypeList>
|
||||||
|
```
|
||||||
|
|
||||||
|
**Grant Information**:
|
||||||
|
```xml
|
||||||
|
<GrantList>
|
||||||
|
<Grant>
|
||||||
|
<GrantID>R01-123456</GrantID>
|
||||||
|
<Agency>NIAID NIH HHS</Agency>
|
||||||
|
<Country>United States</Country>
|
||||||
|
</Grant>
|
||||||
|
</GrantList>
|
||||||
|
```
|
||||||
|
|
||||||
|
### arXiv API
|
||||||
|
|
||||||
|
**Preprints in physics, math, CS, q-bio** - Free, open access.
|
||||||
|
|
||||||
|
**Base URL**: `https://export.arxiv.org/api/query`
|
||||||
|
|
||||||
|
**No API key required**
|
||||||
|
|
||||||
|
#### arXiv ID to Metadata
|
||||||
|
|
||||||
|
**Request**:
|
||||||
|
```
|
||||||
|
GET https://export.arxiv.org/api/query?id_list=2103.14030
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response**: Atom XML
|
||||||
|
|
||||||
|
```xml
|
||||||
|
<entry>
|
||||||
|
<id>http://arxiv.org/abs/2103.14030v2</id>
|
||||||
|
<title>Highly accurate protein structure prediction with AlphaFold</title>
|
||||||
|
<author><name>John Jumper</name></author>
|
||||||
|
<author><name>Richard Evans</name></author>
|
||||||
|
<published>2021-03-26T17:47:17Z</published>
|
||||||
|
<updated>2021-07-01T16:51:46Z</updated>
|
||||||
|
<summary>Abstract text here...</summary>
|
||||||
|
<arxiv:doi>10.1038/s41586-021-03819-2</arxiv:doi>
|
||||||
|
<category term="q-bio.BM" scheme="http://arxiv.org/schemas/atom"/>
|
||||||
|
<category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
|
||||||
|
</entry>
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Key Fields
|
||||||
|
|
||||||
|
- `id`: arXiv URL
|
||||||
|
- `title`: Preprint title
|
||||||
|
- `author`: Author list
|
||||||
|
- `published`: First version date
|
||||||
|
- `updated`: Latest version date
|
||||||
|
- `summary`: Abstract
|
||||||
|
- `arxiv:doi`: DOI if published
|
||||||
|
- `arxiv:journal_ref`: Journal reference if published
|
||||||
|
- `category`: arXiv categories
|
||||||
|
|
||||||
|
#### Version Tracking
|
||||||
|
|
||||||
|
arXiv tracks versions:
|
||||||
|
- `v1`: Initial submission
|
||||||
|
- `v2`, `v3`, etc.: Revisions
|
||||||
|
|
||||||
|
**Always check** if preprint has been published in journal (use DOI if available).
|
||||||
|
|
||||||
|
### DataCite API
|
||||||
|
|
||||||
|
**Research datasets, software, other outputs** - Assigns DOIs to non-traditional scholarly works.
|
||||||
|
|
||||||
|
**Base URL**: `https://api.datacite.org/dois/`
|
||||||
|
|
||||||
|
**Similar to CrossRef** but for datasets, software, code, etc.
|
||||||
|
|
||||||
|
**Request**:
|
||||||
|
```
|
||||||
|
GET https://api.datacite.org/dois/10.5281/zenodo.1234567
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response**: JSON with metadata for dataset/software
|
||||||
|
|
||||||
|
## Required BibTeX Fields
|
||||||
|
|
||||||
|
### @article (Journal Articles)
|
||||||
|
|
||||||
|
**Required**:
|
||||||
|
- `author`: Author names
|
||||||
|
- `title`: Article title
|
||||||
|
- `journal`: Journal name
|
||||||
|
- `year`: Publication year
|
||||||
|
|
||||||
|
**Optional but recommended**:
|
||||||
|
- `volume`: Volume number
|
||||||
|
- `number`: Issue number
|
||||||
|
- `pages`: Page range (e.g., 123--145)
|
||||||
|
- `doi`: Digital Object Identifier
|
||||||
|
- `url`: URL if no DOI
|
||||||
|
- `month`: Publication month
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@article{Smith2024,
|
||||||
|
author = {Smith, John and Doe, Jane},
|
||||||
|
title = {Novel Approach to Protein Folding},
|
||||||
|
journal = {Nature},
|
||||||
|
year = {2024},
|
||||||
|
volume = {625},
|
||||||
|
number = {8001},
|
||||||
|
pages = {123--145},
|
||||||
|
doi = {10.1038/nature12345}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @book (Books)
|
||||||
|
|
||||||
|
**Required**:
|
||||||
|
- `author` or `editor`: Author(s) or editor(s)
|
||||||
|
- `title`: Book title
|
||||||
|
- `publisher`: Publisher name
|
||||||
|
- `year`: Publication year
|
||||||
|
|
||||||
|
**Optional but recommended**:
|
||||||
|
- `edition`: Edition number (if not first)
|
||||||
|
- `address`: Publisher location
|
||||||
|
- `isbn`: ISBN
|
||||||
|
- `url`: URL
|
||||||
|
- `series`: Series name
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@book{Kumar2021,
|
||||||
|
author = {Kumar, Vinay and Abbas, Abul K. and Aster, Jon C.},
|
||||||
|
title = {Robbins and Cotran Pathologic Basis of Disease},
|
||||||
|
publisher = {Elsevier},
|
||||||
|
year = {2021},
|
||||||
|
edition = {10},
|
||||||
|
isbn = {978-0-323-53113-9}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @inproceedings (Conference Papers)
|
||||||
|
|
||||||
|
**Required**:
|
||||||
|
- `author`: Author names
|
||||||
|
- `title`: Paper title
|
||||||
|
- `booktitle`: Conference/proceedings name
|
||||||
|
- `year`: Year
|
||||||
|
|
||||||
|
**Optional but recommended**:
|
||||||
|
- `pages`: Page range
|
||||||
|
- `organization`: Organizing body
|
||||||
|
- `publisher`: Publisher
|
||||||
|
- `address`: Conference location
|
||||||
|
- `month`: Conference month
|
||||||
|
- `doi`: DOI if available
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@inproceedings{Vaswani2017,
|
||||||
|
author = {Vaswani, Ashish and Shazeer, Noam and others},
|
||||||
|
title = {Attention is All You Need},
|
||||||
|
booktitle = {Advances in Neural Information Processing Systems},
|
||||||
|
year = {2017},
|
||||||
|
pages = {5998--6008},
|
||||||
|
volume = {30}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @incollection (Book Chapters)
|
||||||
|
|
||||||
|
**Required**:
|
||||||
|
- `author`: Chapter author(s)
|
||||||
|
- `title`: Chapter title
|
||||||
|
- `booktitle`: Book title
|
||||||
|
- `publisher`: Publisher name
|
||||||
|
- `year`: Publication year
|
||||||
|
|
||||||
|
**Optional but recommended**:
|
||||||
|
- `editor`: Book editor(s)
|
||||||
|
- `pages`: Chapter page range
|
||||||
|
- `chapter`: Chapter number
|
||||||
|
- `edition`: Edition
|
||||||
|
- `address`: Publisher location
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@incollection{Brown2020,
|
||||||
|
author = {Brown, Patrick O. and Botstein, David},
|
||||||
|
title = {Exploring the New World of the Genome with {DNA} Microarrays},
|
||||||
|
booktitle = {DNA Microarrays: A Molecular Cloning Manual},
|
||||||
|
editor = {Eisen, Michael B. and Brown, Patrick O.},
|
||||||
|
publisher = {Cold Spring Harbor Laboratory Press},
|
||||||
|
year = {2020},
|
||||||
|
pages = {1--45}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @phdthesis (Dissertations)
|
||||||
|
|
||||||
|
**Required**:
|
||||||
|
- `author`: Author name
|
||||||
|
- `title`: Thesis title
|
||||||
|
- `school`: Institution
|
||||||
|
- `year`: Year
|
||||||
|
|
||||||
|
**Optional**:
|
||||||
|
- `type`: Type (e.g., "PhD dissertation")
|
||||||
|
- `address`: Institution location
|
||||||
|
- `month`: Month
|
||||||
|
- `url`: URL
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@phdthesis{Johnson2023,
|
||||||
|
author = {Johnson, Mary L.},
|
||||||
|
title = {Novel Approaches to Cancer Immunotherapy},
|
||||||
|
school = {Stanford University},
|
||||||
|
year = {2023},
|
||||||
|
type = {{PhD} dissertation}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### @misc (Preprints, Software, Datasets)
|
||||||
|
|
||||||
|
**Required**:
|
||||||
|
- `author`: Author(s)
|
||||||
|
- `title`: Title
|
||||||
|
- `year`: Year
|
||||||
|
|
||||||
|
**For preprints, add**:
|
||||||
|
- `howpublished`: Repository (e.g., "bioRxiv")
|
||||||
|
- `doi`: Preprint DOI
|
||||||
|
- `note`: Preprint ID
|
||||||
|
|
||||||
|
**Example (preprint)**:
|
||||||
|
```bibtex
|
||||||
|
@misc{Zhang2024,
|
||||||
|
author = {Zhang, Yi and Chen, Li and Wang, Hui},
|
||||||
|
title = {Novel Therapeutic Targets in Alzheimer's Disease},
|
||||||
|
year = {2024},
|
||||||
|
howpublished = {bioRxiv},
|
||||||
|
doi = {10.1101/2024.01.001},
|
||||||
|
note = {Preprint}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example (software)**:
|
||||||
|
```bibtex
|
||||||
|
@misc{AlphaFold2021,
|
||||||
|
author = {DeepMind},
|
||||||
|
title = {{AlphaFold} Protein Structure Database},
|
||||||
|
year = {2021},
|
||||||
|
howpublished = {Software},
|
||||||
|
url = {https://alphafold.ebi.ac.uk/},
|
||||||
|
doi = {10.5281/zenodo.5123456}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Extraction Workflows
|
||||||
|
|
||||||
|
### From DOI
|
||||||
|
|
||||||
|
**Best practice** - Most reliable source:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Single DOI
|
||||||
|
python scripts/extract_metadata.py --doi 10.1038/s41586-021-03819-2
|
||||||
|
|
||||||
|
# Multiple DOIs
|
||||||
|
python scripts/extract_metadata.py \
|
||||||
|
--doi 10.1038/nature12345 \
|
||||||
|
--doi 10.1126/science.abc1234 \
|
||||||
|
--output refs.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
**Process**:
|
||||||
|
1. Query CrossRef API with DOI
|
||||||
|
2. Parse JSON response
|
||||||
|
3. Extract required fields
|
||||||
|
4. Determine entry type (@article, @book, etc.)
|
||||||
|
5. Format as BibTeX
|
||||||
|
6. Validate completeness
|
||||||
|
|
||||||
|
### From PMID
|
||||||
|
|
||||||
|
**For biomedical literature**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Single PMID
|
||||||
|
python scripts/extract_metadata.py --pmid 34265844
|
||||||
|
|
||||||
|
# Multiple PMIDs
|
||||||
|
python scripts/extract_metadata.py \
|
||||||
|
--pmid 34265844 \
|
||||||
|
--pmid 28445112 \
|
||||||
|
--output refs.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
**Process**:
|
||||||
|
1. Query PubMed EFetch with PMID
|
||||||
|
2. Parse XML response
|
||||||
|
3. Extract metadata including MeSH terms
|
||||||
|
4. Check for DOI in response
|
||||||
|
5. If DOI exists, optionally query CrossRef for additional metadata
|
||||||
|
6. Format as BibTeX
|
||||||
|
|
||||||
|
### From arXiv ID
|
||||||
|
|
||||||
|
**For preprints**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/extract_metadata.py --arxiv 2103.14030
|
||||||
|
```
|
||||||
|
|
||||||
|
**Process**:
|
||||||
|
1. Query arXiv API with ID
|
||||||
|
2. Parse Atom XML response
|
||||||
|
3. Check for published version (DOI in response)
|
||||||
|
4. If published: Use DOI and CrossRef
|
||||||
|
5. If not published: Use preprint metadata
|
||||||
|
6. Format as @misc with preprint note
|
||||||
|
|
||||||
|
**Important**: Always check if preprint has been published!
|
||||||
|
|
||||||
|
### From URL
|
||||||
|
|
||||||
|
**When you only have URL**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/extract_metadata.py \
|
||||||
|
--url "https://www.nature.com/articles/s41586-021-03819-2"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Process**:
|
||||||
|
1. Parse URL to extract identifier
|
||||||
|
2. Identify type (DOI, PMID, arXiv)
|
||||||
|
3. Extract identifier from URL
|
||||||
|
4. Query appropriate API
|
||||||
|
5. Format as BibTeX
|
||||||
|
|
||||||
|
**URL patterns**:
|
||||||
|
```
|
||||||
|
# DOI URLs
|
||||||
|
https://doi.org/10.1038/nature12345
|
||||||
|
https://dx.doi.org/10.1126/science.abc123
|
||||||
|
https://www.nature.com/articles/s41586-021-03819-2
|
||||||
|
|
||||||
|
# PubMed URLs
|
||||||
|
https://pubmed.ncbi.nlm.nih.gov/34265844/
|
||||||
|
https://www.ncbi.nlm.nih.gov/pubmed/34265844
|
||||||
|
|
||||||
|
# arXiv URLs
|
||||||
|
https://arxiv.org/abs/2103.14030
|
||||||
|
https://arxiv.org/pdf/2103.14030.pdf
|
||||||
|
```
|
||||||
|
|
||||||
|
### Batch Processing
|
||||||
|
|
||||||
|
**From file with mixed identifiers**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create file with one identifier per line
|
||||||
|
# identifiers.txt:
|
||||||
|
# 10.1038/nature12345
|
||||||
|
# 34265844
|
||||||
|
# 2103.14030
|
||||||
|
# https://doi.org/10.1126/science.abc123
|
||||||
|
|
||||||
|
python scripts/extract_metadata.py \
|
||||||
|
--input identifiers.txt \
|
||||||
|
--output references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
**Process**:
|
||||||
|
- Script auto-detects identifier type
|
||||||
|
- Queries appropriate API
|
||||||
|
- Combines all into single BibTeX file
|
||||||
|
- Handles errors gracefully
|
||||||
|
|
||||||
|
## Special Cases and Edge Cases
|
||||||
|
|
||||||
|
### Preprints Later Published
|
||||||
|
|
||||||
|
**Issue**: Preprint cited, but journal version now available.
|
||||||
|
|
||||||
|
**Solution**:
|
||||||
|
1. Check arXiv metadata for DOI field
|
||||||
|
2. If DOI present, use published version
|
||||||
|
3. Update citation to journal article
|
||||||
|
4. Note preprint version in comments if needed
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
% Originally: arXiv:2103.14030
|
||||||
|
% Published as:
|
||||||
|
@article{Jumper2021,
|
||||||
|
author = {Jumper, John and Evans, Richard and others},
|
||||||
|
title = {Highly Accurate Protein Structure Prediction with {AlphaFold}},
|
||||||
|
journal = {Nature},
|
||||||
|
year = {2021},
|
||||||
|
volume = {596},
|
||||||
|
pages = {583--589},
|
||||||
|
doi = {10.1038/s41586-021-03819-2}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Multiple Authors (et al.)
|
||||||
|
|
||||||
|
**Issue**: Many authors (10+).
|
||||||
|
|
||||||
|
**BibTeX practice**:
|
||||||
|
- Include all authors if <10
|
||||||
|
- Use "and others" for 10+
|
||||||
|
- Or list all (journals vary)
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@article{LargeCollaboration2024,
|
||||||
|
author = {First, Author and Second, Author and Third, Author and others},
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Author Name Variations
|
||||||
|
|
||||||
|
**Issue**: Authors publish under different name formats.
|
||||||
|
|
||||||
|
**Standardization**:
|
||||||
|
```
|
||||||
|
# Common variations
|
||||||
|
John Smith
|
||||||
|
John A. Smith
|
||||||
|
John Andrew Smith
|
||||||
|
J. A. Smith
|
||||||
|
Smith, J.
|
||||||
|
Smith, J. A.
|
||||||
|
|
||||||
|
# BibTeX format (recommended)
|
||||||
|
author = {Smith, John A.}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Extraction preference**:
|
||||||
|
1. Use full name if available
|
||||||
|
2. Include middle initial if available
|
||||||
|
3. Format: Last, First Middle
|
||||||
|
|
||||||
|
### No DOI Available
|
||||||
|
|
||||||
|
**Issue**: Older papers or books without DOIs.
|
||||||
|
|
||||||
|
**Solutions**:
|
||||||
|
1. Use PMID if available (biomedical)
|
||||||
|
2. Use ISBN for books
|
||||||
|
3. Use URL to stable source
|
||||||
|
4. Include full publication details
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```bibtex
|
||||||
|
@article{OldPaper1995,
|
||||||
|
author = {Author, Name},
|
||||||
|
title = {Title Here},
|
||||||
|
journal = {Journal Name},
|
||||||
|
year = {1995},
|
||||||
|
volume = {123},
|
||||||
|
pages = {45--67},
|
||||||
|
url = {https://stable-url-here},
|
||||||
|
note = {PMID: 12345678}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Conference Papers vs Journal Articles
|
||||||
|
|
||||||
|
**Issue**: Same work published in both.
|
||||||
|
|
||||||
|
**Best practice**:
|
||||||
|
- Cite journal version if both available
|
||||||
|
- Journal version is archival
|
||||||
|
- Conference version for timeliness
|
||||||
|
|
||||||
|
**If citing conference**:
|
||||||
|
```bibtex
|
||||||
|
@inproceedings{Smith2024conf,
|
||||||
|
author = {Smith, John},
|
||||||
|
title = {Title},
|
||||||
|
booktitle = {Proceedings of NeurIPS 2024},
|
||||||
|
year = {2024}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**If citing journal**:
|
||||||
|
```bibtex
|
||||||
|
@article{Smith2024journal,
|
||||||
|
author = {Smith, John},
|
||||||
|
title = {Title},
|
||||||
|
journal = {Journal of Machine Learning Research},
|
||||||
|
year = {2024}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Book Chapters vs Edited Collections
|
||||||
|
|
||||||
|
**Extract correctly**:
|
||||||
|
- Chapter: Use `@incollection`
|
||||||
|
- Whole book: Use `@book`
|
||||||
|
- Book editor: List in `editor` field
|
||||||
|
- Chapter author: List in `author` field
|
||||||
|
|
||||||
|
### Datasets and Software
|
||||||
|
|
||||||
|
**Use @misc** with appropriate fields:
|
||||||
|
|
||||||
|
```bibtex
|
||||||
|
@misc{DatasetName2024,
|
||||||
|
author = {Author, Name},
|
||||||
|
title = {Dataset Title},
|
||||||
|
year = {2024},
|
||||||
|
howpublished = {Zenodo},
|
||||||
|
doi = {10.5281/zenodo.123456},
|
||||||
|
note = {Version 1.2}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Validation After Extraction
|
||||||
|
|
||||||
|
Always validate extracted metadata:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py extracted_refs.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
**Check**:
|
||||||
|
- All required fields present
|
||||||
|
- DOI resolves correctly
|
||||||
|
- Author names formatted consistently
|
||||||
|
- Year is reasonable (4 digits)
|
||||||
|
- Journal/publisher names correct
|
||||||
|
- Page ranges use -- not -
|
||||||
|
- Special characters handled properly
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
### 1. Prefer DOI When Available
|
||||||
|
|
||||||
|
DOIs provide:
|
||||||
|
- Permanent identifier
|
||||||
|
- Best metadata source
|
||||||
|
- Publisher-verified information
|
||||||
|
- Resolvable link
|
||||||
|
|
||||||
|
### 2. Verify Automatically Extracted Metadata
|
||||||
|
|
||||||
|
Spot-check:
|
||||||
|
- Author names match publication
|
||||||
|
- Title matches (including capitalization)
|
||||||
|
- Year is correct
|
||||||
|
- Journal name is complete
|
||||||
|
|
||||||
|
### 3. Handle Special Characters
|
||||||
|
|
||||||
|
**LaTeX special characters**:
|
||||||
|
- Protect capitalization: `{AlphaFold}`
|
||||||
|
- Handle accents: `M{\"u}ller` or use Unicode
|
||||||
|
- Chemical formulas: `H$_2$O` or `\ce{H2O}`
|
||||||
|
|
||||||
|
### 4. Use Consistent Citation Keys
|
||||||
|
|
||||||
|
**Convention**: `FirstAuthorYEARkeyword`
|
||||||
|
```
|
||||||
|
Smith2024protein
|
||||||
|
Doe2023machine
|
||||||
|
Johnson2024cancer
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Include DOI for Modern Papers
|
||||||
|
|
||||||
|
All papers published after ~2000 should have a DOI:
|
||||||
|
```bibtex
|
||||||
|
doi = {10.1038/nature12345}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Document Source
|
||||||
|
|
||||||
|
For non-standard sources, add note:
|
||||||
|
```bibtex
|
||||||
|
note = {Preprint, not peer-reviewed}
|
||||||
|
note = {Technical report}
|
||||||
|
note = {Dataset accompanying [citation]}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
Metadata extraction workflow:
|
||||||
|
|
||||||
|
1. **Identify**: Determine identifier type (DOI, PMID, arXiv, URL)
|
||||||
|
2. **Query**: Use appropriate API (CrossRef, PubMed, arXiv)
|
||||||
|
3. **Extract**: Parse response for required fields
|
||||||
|
4. **Format**: Create properly formatted BibTeX entry
|
||||||
|
5. **Validate**: Check completeness and accuracy
|
||||||
|
6. **Verify**: Spot-check critical citations
|
||||||
|
|
||||||
|
**Use scripts** to automate:
|
||||||
|
- `extract_metadata.py`: Universal extractor
|
||||||
|
- `doi_to_bibtex.py`: Quick DOI conversion
|
||||||
|
- `validate_citations.py`: Verify accuracy
|
||||||
|
|
||||||
|
**Always validate** extracted metadata before final submission!
|
||||||
|
|
||||||
839
references/pubmed_search.md
Normal file
839
references/pubmed_search.md
Normal file
@@ -0,0 +1,839 @@
|
|||||||
|
# PubMed Search Guide
|
||||||
|
|
||||||
|
Comprehensive guide to searching PubMed for biomedical and life sciences literature, including MeSH terms, field tags, advanced search strategies, and E-utilities API usage.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
PubMed is the premier database for biomedical literature:
|
||||||
|
- **Coverage**: 35+ million citations
|
||||||
|
- **Scope**: Biomedical and life sciences
|
||||||
|
- **Sources**: MEDLINE, life science journals, online books
|
||||||
|
- **Authority**: Maintained by National Library of Medicine (NLM) / NCBI
|
||||||
|
- **Access**: Free, no account required
|
||||||
|
- **Updates**: Daily with new citations
|
||||||
|
- **Curation**: High-quality metadata, MeSH indexing
|
||||||
|
|
||||||
|
## Basic Search
|
||||||
|
|
||||||
|
### Simple Keyword Search
|
||||||
|
|
||||||
|
PubMed automatically maps terms to MeSH and searches multiple fields:
|
||||||
|
|
||||||
|
```
|
||||||
|
diabetes
|
||||||
|
CRISPR gene editing
|
||||||
|
Alzheimer's disease treatment
|
||||||
|
cancer immunotherapy
|
||||||
|
```
|
||||||
|
|
||||||
|
**Automatic Features**:
|
||||||
|
- Automatic MeSH mapping
|
||||||
|
- Plural/singular variants
|
||||||
|
- Abbreviation expansion
|
||||||
|
- Spell checking
|
||||||
|
|
||||||
|
### Exact Phrase Search
|
||||||
|
|
||||||
|
Use quotation marks for exact phrases:
|
||||||
|
|
||||||
|
```
|
||||||
|
"CRISPR-Cas9"
|
||||||
|
"systematic review"
|
||||||
|
"randomized controlled trial"
|
||||||
|
"machine learning"
|
||||||
|
```
|
||||||
|
|
||||||
|
## MeSH (Medical Subject Headings)
|
||||||
|
|
||||||
|
### What is MeSH?
|
||||||
|
|
||||||
|
MeSH is a controlled vocabulary thesaurus for indexing biomedical literature:
|
||||||
|
- **Hierarchical structure**: Organized in tree structures
|
||||||
|
- **Consistent indexing**: Same concept always tagged the same way
|
||||||
|
- **Comprehensive**: Covers diseases, drugs, anatomy, techniques, etc.
|
||||||
|
- **Professional curation**: NLM indexers assign MeSH terms
|
||||||
|
|
||||||
|
### Finding MeSH Terms
|
||||||
|
|
||||||
|
**MeSH Browser**: https://meshb.nlm.nih.gov/search
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```
|
||||||
|
Search: "heart attack"
|
||||||
|
MeSH term: "Myocardial Infarction"
|
||||||
|
```
|
||||||
|
|
||||||
|
**In PubMed**:
|
||||||
|
1. Search with keyword
|
||||||
|
2. Check "MeSH Terms" in left sidebar
|
||||||
|
3. Select relevant MeSH terms
|
||||||
|
4. Add to search
|
||||||
|
|
||||||
|
### Using MeSH in Searches
|
||||||
|
|
||||||
|
**Basic MeSH search**:
|
||||||
|
```
|
||||||
|
"Diabetes Mellitus"[MeSH]
|
||||||
|
"CRISPR-Cas Systems"[MeSH]
|
||||||
|
"Alzheimer Disease"[MeSH]
|
||||||
|
"Neoplasms"[MeSH]
|
||||||
|
```
|
||||||
|
|
||||||
|
**MeSH with subheadings**:
|
||||||
|
```
|
||||||
|
"Diabetes Mellitus/drug therapy"[MeSH]
|
||||||
|
"Neoplasms/genetics"[MeSH]
|
||||||
|
"Heart Failure/prevention and control"[MeSH]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Common subheadings**:
|
||||||
|
- `/drug therapy`: Drug treatment
|
||||||
|
- `/diagnosis`: Diagnostic aspects
|
||||||
|
- `/genetics`: Genetic aspects
|
||||||
|
- `/epidemiology`: Occurrence and distribution
|
||||||
|
- `/prevention and control`: Prevention methods
|
||||||
|
- `/etiology`: Causes
|
||||||
|
- `/surgery`: Surgical treatment
|
||||||
|
- `/metabolism`: Metabolic aspects
|
||||||
|
|
||||||
|
### MeSH Explosion
|
||||||
|
|
||||||
|
By default, MeSH searches include narrower terms (explosion):
|
||||||
|
|
||||||
|
```
|
||||||
|
"Neoplasms"[MeSH]
|
||||||
|
# Includes: Breast Neoplasms, Lung Neoplasms, etc.
|
||||||
|
```
|
||||||
|
|
||||||
|
**Disable explosion** (exact term only):
|
||||||
|
```
|
||||||
|
"Neoplasms"[MeSH:NoExp]
|
||||||
|
```
|
||||||
|
|
||||||
|
### MeSH Major Topic
|
||||||
|
|
||||||
|
Search only where MeSH term is a major focus:
|
||||||
|
|
||||||
|
```
|
||||||
|
"Diabetes Mellitus"[MeSH Major Topic]
|
||||||
|
# Only papers where diabetes is main topic
|
||||||
|
```
|
||||||
|
|
||||||
|
## Field Tags
|
||||||
|
|
||||||
|
Field tags specify which part of the record to search.
|
||||||
|
|
||||||
|
### Common Field Tags
|
||||||
|
|
||||||
|
**Title and Abstract**:
|
||||||
|
```
|
||||||
|
cancer[Title] # In title only
|
||||||
|
treatment[Title/Abstract] # In title or abstract
|
||||||
|
"machine learning"[Title/Abstract]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Author**:
|
||||||
|
```
|
||||||
|
"Smith J"[Author]
|
||||||
|
"Doudna JA"[Author]
|
||||||
|
"Collins FS"[Author]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Author - Full Name**:
|
||||||
|
```
|
||||||
|
"Smith, John"[Full Author Name]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Journal**:
|
||||||
|
```
|
||||||
|
"Nature"[Journal]
|
||||||
|
"Science"[Journal]
|
||||||
|
"New England Journal of Medicine"[Journal]
|
||||||
|
"Nat Commun"[Journal] # Abbreviated form
|
||||||
|
```
|
||||||
|
|
||||||
|
**Publication Date**:
|
||||||
|
```
|
||||||
|
2023[Publication Date]
|
||||||
|
2020:2024[Publication Date] # Date range
|
||||||
|
2023/01/01:2023/12/31[Publication Date]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Date Created**:
|
||||||
|
```
|
||||||
|
2023[Date - Create] # When added to PubMed
|
||||||
|
```
|
||||||
|
|
||||||
|
**Publication Type**:
|
||||||
|
```
|
||||||
|
"Review"[Publication Type]
|
||||||
|
"Clinical Trial"[Publication Type]
|
||||||
|
"Meta-Analysis"[Publication Type]
|
||||||
|
"Randomized Controlled Trial"[Publication Type]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Language**:
|
||||||
|
```
|
||||||
|
English[Language]
|
||||||
|
French[Language]
|
||||||
|
```
|
||||||
|
|
||||||
|
**DOI**:
|
||||||
|
```
|
||||||
|
10.1038/nature12345[DOI]
|
||||||
|
```
|
||||||
|
|
||||||
|
**PMID (PubMed ID)**:
|
||||||
|
```
|
||||||
|
12345678[PMID]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Article ID**:
|
||||||
|
```
|
||||||
|
PMC1234567[PMC] # PubMed Central ID
|
||||||
|
```
|
||||||
|
|
||||||
|
### Less Common But Useful Tags
|
||||||
|
|
||||||
|
```
|
||||||
|
humans[MeSH Terms] # Only human studies
|
||||||
|
animals[MeSH Terms] # Only animal studies
|
||||||
|
"United States"[Place of Publication]
|
||||||
|
nih[Grant Number] # NIH-funded research
|
||||||
|
"Female"[Sex] # Female subjects
|
||||||
|
"Aged, 80 and over"[Age] # Elderly subjects
|
||||||
|
```
|
||||||
|
|
||||||
|
## Boolean Operators
|
||||||
|
|
||||||
|
Combine search terms with Boolean logic.
|
||||||
|
|
||||||
|
### AND
|
||||||
|
|
||||||
|
Both terms must be present (default behavior):
|
||||||
|
|
||||||
|
```
|
||||||
|
diabetes AND treatment
|
||||||
|
"CRISPR-Cas9" AND "gene editing"
|
||||||
|
cancer AND immunotherapy AND "clinical trial"[Publication Type]
|
||||||
|
```
|
||||||
|
|
||||||
|
### OR
|
||||||
|
|
||||||
|
Either term must be present:
|
||||||
|
|
||||||
|
```
|
||||||
|
"heart attack" OR "myocardial infarction"
|
||||||
|
diabetes OR "diabetes mellitus"
|
||||||
|
CRISPR OR Cas9 OR "gene editing"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Use case**: Synonyms and related terms
|
||||||
|
|
||||||
|
### NOT
|
||||||
|
|
||||||
|
Exclude terms:
|
||||||
|
|
||||||
|
```
|
||||||
|
cancer NOT review
|
||||||
|
diabetes NOT animal
|
||||||
|
"machine learning" NOT "deep learning"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Caution**: May exclude relevant papers that mention both terms.
|
||||||
|
|
||||||
|
### Combining Operators
|
||||||
|
|
||||||
|
Use parentheses for complex logic:
|
||||||
|
|
||||||
|
```
|
||||||
|
(diabetes OR "diabetes mellitus") AND (treatment OR therapy)
|
||||||
|
|
||||||
|
("CRISPR" OR "gene editing") AND ("therapeutic" OR "therapy")
|
||||||
|
AND 2020:2024[Publication Date]
|
||||||
|
|
||||||
|
(cancer OR neoplasm) AND (immunotherapy OR "immune checkpoint inhibitor")
|
||||||
|
AND ("clinical trial"[Publication Type] OR "randomized controlled trial"[Publication Type])
|
||||||
|
```
|
||||||
|
|
||||||
|
## Advanced Search Builder
|
||||||
|
|
||||||
|
**Access**: https://pubmed.ncbi.nlm.nih.gov/advanced/
|
||||||
|
|
||||||
|
**Features**:
|
||||||
|
- Visual query builder
|
||||||
|
- Add multiple query boxes
|
||||||
|
- Select field tags from dropdowns
|
||||||
|
- Combine with AND/OR/NOT
|
||||||
|
- Preview results
|
||||||
|
- Shows final query string
|
||||||
|
- Save queries
|
||||||
|
|
||||||
|
**Workflow**:
|
||||||
|
1. Add search terms in separate boxes
|
||||||
|
2. Select field tags
|
||||||
|
3. Choose Boolean operators
|
||||||
|
4. Preview results
|
||||||
|
5. Refine as needed
|
||||||
|
6. Copy final query string
|
||||||
|
7. Use in scripts or save
|
||||||
|
|
||||||
|
**Example built query**:
|
||||||
|
```
|
||||||
|
#1: "Diabetes Mellitus, Type 2"[MeSH]
|
||||||
|
#2: "Metformin"[MeSH]
|
||||||
|
#3: "Clinical Trial"[Publication Type]
|
||||||
|
#4: 2020:2024[Publication Date]
|
||||||
|
#5: #1 AND #2 AND #3 AND #4
|
||||||
|
```
|
||||||
|
|
||||||
|
## Filters and Limits
|
||||||
|
|
||||||
|
### Article Types
|
||||||
|
|
||||||
|
```
|
||||||
|
"Review"[Publication Type]
|
||||||
|
"Systematic Review"[Publication Type]
|
||||||
|
"Meta-Analysis"[Publication Type]
|
||||||
|
"Clinical Trial"[Publication Type]
|
||||||
|
"Randomized Controlled Trial"[Publication Type]
|
||||||
|
"Case Reports"[Publication Type]
|
||||||
|
"Comparative Study"[Publication Type]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Species
|
||||||
|
|
||||||
|
```
|
||||||
|
humans[MeSH Terms]
|
||||||
|
mice[MeSH Terms]
|
||||||
|
rats[MeSH Terms]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Sex
|
||||||
|
|
||||||
|
```
|
||||||
|
"Female"[MeSH Terms]
|
||||||
|
"Male"[MeSH Terms]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Age Groups
|
||||||
|
|
||||||
|
```
|
||||||
|
"Infant"[MeSH Terms]
|
||||||
|
"Child"[MeSH Terms]
|
||||||
|
"Adolescent"[MeSH Terms]
|
||||||
|
"Adult"[MeSH Terms]
|
||||||
|
"Aged"[MeSH Terms]
|
||||||
|
"Aged, 80 and over"[MeSH Terms]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Text Availability
|
||||||
|
|
||||||
|
```
|
||||||
|
free full text[Filter] # Free full-text available
|
||||||
|
```
|
||||||
|
|
||||||
|
### Journal Categories
|
||||||
|
|
||||||
|
```
|
||||||
|
"Journal Article"[Publication Type]
|
||||||
|
```
|
||||||
|
|
||||||
|
## E-utilities API
|
||||||
|
|
||||||
|
NCBI provides programmatic access via E-utilities (Entrez Programming Utilities).
|
||||||
|
|
||||||
|
### Overview
|
||||||
|
|
||||||
|
**Base URL**: `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/`
|
||||||
|
|
||||||
|
**Main Tools**:
|
||||||
|
- **ESearch**: Search and retrieve PMIDs
|
||||||
|
- **EFetch**: Retrieve full records
|
||||||
|
- **ESummary**: Retrieve document summaries
|
||||||
|
- **ELink**: Find related articles
|
||||||
|
- **EInfo**: Database statistics
|
||||||
|
|
||||||
|
**No API key required**, but recommended for:
|
||||||
|
- Higher rate limits (10/sec vs 3/sec)
|
||||||
|
- Better performance
|
||||||
|
- Identifying your project to NCBI
|
||||||
|
|
||||||
|
**Get API key**: https://www.ncbi.nlm.nih.gov/account/
|
||||||
|
|
||||||
|
### ESearch - Search PubMed
|
||||||
|
|
||||||
|
Retrieve PMIDs for a query.
|
||||||
|
|
||||||
|
**Endpoint**: `/esearch.fcgi`
|
||||||
|
|
||||||
|
**Parameters**:
|
||||||
|
- `db`: Database (pubmed)
|
||||||
|
- `term`: Search query
|
||||||
|
- `retmax`: Maximum results (default 20, max 10000)
|
||||||
|
- `retstart`: Starting position (for pagination)
|
||||||
|
- `sort`: Sort order (relevance, pub_date, author)
|
||||||
|
- `api_key`: Your API key (optional but recommended)
|
||||||
|
|
||||||
|
**Example URL**:
|
||||||
|
```
|
||||||
|
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?
|
||||||
|
db=pubmed&
|
||||||
|
term=diabetes+AND+treatment&
|
||||||
|
retmax=100&
|
||||||
|
retmode=json&
|
||||||
|
api_key=YOUR_API_KEY
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response**:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"esearchresult": {
|
||||||
|
"count": "250000",
|
||||||
|
"retmax": "100",
|
||||||
|
"idlist": ["12345678", "12345679", ...]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### EFetch - Retrieve Records
|
||||||
|
|
||||||
|
Get full metadata for PMIDs.
|
||||||
|
|
||||||
|
**Endpoint**: `/efetch.fcgi`
|
||||||
|
|
||||||
|
**Parameters**:
|
||||||
|
- `db`: Database (pubmed)
|
||||||
|
- `id`: Comma-separated PMIDs
|
||||||
|
- `retmode`: Format (xml, json, text)
|
||||||
|
- `rettype`: Type (abstract, medline, full)
|
||||||
|
- `api_key`: Your API key
|
||||||
|
|
||||||
|
**Example URL**:
|
||||||
|
```
|
||||||
|
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?
|
||||||
|
db=pubmed&
|
||||||
|
id=12345678,12345679&
|
||||||
|
retmode=xml&
|
||||||
|
api_key=YOUR_API_KEY
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response**: XML with complete metadata including:
|
||||||
|
- Title
|
||||||
|
- Authors (with affiliations)
|
||||||
|
- Abstract
|
||||||
|
- Journal
|
||||||
|
- Publication date
|
||||||
|
- DOI
|
||||||
|
- PMID, PMCID
|
||||||
|
- MeSH terms
|
||||||
|
- Keywords
|
||||||
|
|
||||||
|
### ESummary - Get Summaries
|
||||||
|
|
||||||
|
Lighter-weight alternative to EFetch.
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```
|
||||||
|
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?
|
||||||
|
db=pubmed&
|
||||||
|
id=12345678&
|
||||||
|
retmode=json&
|
||||||
|
api_key=YOUR_API_KEY
|
||||||
|
```
|
||||||
|
|
||||||
|
**Returns**: Key metadata without full abstract and details.
|
||||||
|
|
||||||
|
### ELink - Find Related Articles
|
||||||
|
|
||||||
|
Find related articles or links to other databases.
|
||||||
|
|
||||||
|
**Example**:
|
||||||
|
```
|
||||||
|
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?
|
||||||
|
dbfrom=pubmed&
|
||||||
|
db=pubmed&
|
||||||
|
id=12345678&
|
||||||
|
linkname=pubmed_pubmed_citedin
|
||||||
|
```
|
||||||
|
|
||||||
|
**Link types**:
|
||||||
|
- `pubmed_pubmed`: Related articles
|
||||||
|
- `pubmed_pubmed_citedin`: Papers citing this article
|
||||||
|
- `pubmed_pmc`: PMC full-text versions
|
||||||
|
- `pubmed_protein`: Related protein records
|
||||||
|
|
||||||
|
### Rate Limiting
|
||||||
|
|
||||||
|
**Without API key**:
|
||||||
|
- 3 requests per second
|
||||||
|
- Requests are blocked if the limit is exceeded
|
||||||
|
|
||||||
|
**With API key**:
|
||||||
|
- 10 requests per second
|
||||||
|
- Better for programmatic access
|
||||||
|
|
||||||
|
**Best practice**:
|
||||||
|
```python
|
||||||
|
import time
|
||||||
|
time.sleep(0.34) # ~3 requests/second
|
||||||
|
# or
|
||||||
|
time.sleep(0.11) # ~10 requests/second with API key
|
||||||
|
```
|
||||||
|
|
||||||
|
### API Key Usage
|
||||||
|
|
||||||
|
**Get API key**:
|
||||||
|
1. Create NCBI account: https://www.ncbi.nlm.nih.gov/account/
|
||||||
|
2. Settings → API Key Management
|
||||||
|
3. Create new API key
|
||||||
|
4. Copy key
|
||||||
|
|
||||||
|
**Use in requests**:
|
||||||
|
```
|
||||||
|
&api_key=YOUR_API_KEY_HERE
|
||||||
|
```
|
||||||
|
|
||||||
|
**Store securely**:
|
||||||
|
```bash
|
||||||
|
# In environment variable
|
||||||
|
export NCBI_API_KEY="your_key_here"
|
||||||
|
|
||||||
|
# In script
|
||||||
|
import os
|
||||||
|
api_key = os.getenv('NCBI_API_KEY')
|
||||||
|
```
|
||||||
|
|
||||||
|
## Search Strategies
|
||||||
|
|
||||||
|
### Comprehensive Systematic Search
|
||||||
|
|
||||||
|
For systematic reviews and meta-analyses:
|
||||||
|
|
||||||
|
```
|
||||||
|
# 1. Identify key concepts
|
||||||
|
Concept 1: Diabetes
|
||||||
|
Concept 2: Treatment
|
||||||
|
Concept 3: Outcomes
|
||||||
|
|
||||||
|
# 2. Find MeSH terms and synonyms
|
||||||
|
Concept 1: "Diabetes Mellitus"[MeSH] OR diabetes OR diabetic
|
||||||
|
Concept 2: "Drug Therapy"[MeSH] OR treatment OR therapy OR medication
|
||||||
|
Concept 3: "Treatment Outcome"[MeSH] OR outcome OR efficacy OR effectiveness
|
||||||
|
|
||||||
|
# 3. Combine with AND
|
||||||
|
("Diabetes Mellitus"[MeSH] OR diabetes OR diabetic)
|
||||||
|
AND ("Drug Therapy"[MeSH] OR treatment OR therapy OR medication)
|
||||||
|
AND ("Treatment Outcome"[MeSH] OR outcome OR efficacy OR effectiveness)
|
||||||
|
|
||||||
|
# 4. Add filters
|
||||||
|
AND 2015:2024[Publication Date]
|
||||||
|
AND ("Clinical Trial"[Publication Type] OR "Randomized Controlled Trial"[Publication Type])
|
||||||
|
AND English[Language]
|
||||||
|
AND humans[MeSH Terms]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Finding Clinical Trials
|
||||||
|
|
||||||
|
```
|
||||||
|
# Specific disease + clinical trials
|
||||||
|
"Alzheimer Disease"[MeSH]
|
||||||
|
AND ("Clinical Trial"[Publication Type]
|
||||||
|
OR "Randomized Controlled Trial"[Publication Type])
|
||||||
|
AND 2020:2024[Publication Date]
|
||||||
|
|
||||||
|
# Specific drug trials
|
||||||
|
"Metformin"[MeSH]
|
||||||
|
AND "Diabetes Mellitus, Type 2"[MeSH]
|
||||||
|
AND "Randomized Controlled Trial"[Publication Type]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Finding Reviews
|
||||||
|
|
||||||
|
```
|
||||||
|
# Systematic reviews on topic
|
||||||
|
"CRISPR-Cas Systems"[MeSH]
|
||||||
|
AND ("Systematic Review"[Publication Type] OR "Meta-Analysis"[Publication Type])
|
||||||
|
|
||||||
|
# Reviews in high-impact journals
|
||||||
|
cancer immunotherapy
|
||||||
|
AND "Review"[Publication Type]
|
||||||
|
AND ("Nature"[Journal] OR "Science"[Journal] OR "Cell"[Journal])
|
||||||
|
```
|
||||||
|
|
||||||
|
### Finding Recent Papers
|
||||||
|
|
||||||
|
```
|
||||||
|
# Papers from last year
|
||||||
|
"machine learning"[Title/Abstract]
|
||||||
|
AND "drug discovery"[Title/Abstract]
|
||||||
|
AND 2024[Publication Date]
|
||||||
|
|
||||||
|
# Recent papers in specific journal
|
||||||
|
"CRISPR"[Title/Abstract]
|
||||||
|
AND "Nature"[Journal]
|
||||||
|
AND 2023:2024[Publication Date]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Author Tracking
|
||||||
|
|
||||||
|
```
|
||||||
|
# Specific author's recent work
|
||||||
|
"Doudna JA"[Author] AND 2020:2024[Publication Date]
|
||||||
|
|
||||||
|
# Author + topic
|
||||||
|
"Church GM"[Author] AND "synthetic biology"[Title/Abstract]
|
||||||
|
```
|
||||||
|
|
||||||
|
### High-Quality Evidence
|
||||||
|
|
||||||
|
```
|
||||||
|
# Meta-analyses and systematic reviews
|
||||||
|
(diabetes OR "diabetes mellitus")
|
||||||
|
AND (treatment OR therapy)
|
||||||
|
AND ("Meta-Analysis"[Publication Type] OR "Systematic Review"[Publication Type])
|
||||||
|
|
||||||
|
# RCTs only
|
||||||
|
cancer immunotherapy
|
||||||
|
AND "Randomized Controlled Trial"[Publication Type]
|
||||||
|
AND 2020:2024[Publication Date]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Script Integration
|
||||||
|
|
||||||
|
### search_pubmed.py Usage
|
||||||
|
|
||||||
|
**Basic search**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_pubmed.py "diabetes treatment"
|
||||||
|
```
|
||||||
|
|
||||||
|
**With MeSH terms**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_pubmed.py \
|
||||||
|
--query '"Diabetes Mellitus"[MeSH] AND "Drug Therapy"[MeSH]'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Date range filter**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_pubmed.py "CRISPR" \
|
||||||
|
--date-start 2020-01-01 \
|
||||||
|
--date-end 2024-12-31 \
|
||||||
|
--limit 200
|
||||||
|
```
|
||||||
|
|
||||||
|
**Publication type filter**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_pubmed.py "cancer immunotherapy" \
|
||||||
|
--publication-types "Clinical Trial,Randomized Controlled Trial" \
|
||||||
|
--limit 100
|
||||||
|
```
|
||||||
|
|
||||||
|
**Export to BibTeX**:
|
||||||
|
```bash
|
||||||
|
python scripts/search_pubmed.py "Alzheimer's disease" \
|
||||||
|
--limit 100 \
|
||||||
|
--format bibtex \
|
||||||
|
--output alzheimers.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
**Complex query from file**:
|
||||||
|
```bash
|
||||||
|
# Save complex query in query.txt
|
||||||
|
cat > query.txt << 'EOF'
|
||||||
|
("Diabetes Mellitus, Type 2"[MeSH] OR "diabetes"[Title/Abstract])
|
||||||
|
AND ("Metformin"[MeSH] OR "metformin"[Title/Abstract])
|
||||||
|
AND "Randomized Controlled Trial"[Publication Type]
|
||||||
|
AND 2015:2024[Publication Date]
|
||||||
|
AND English[Language]
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Run search
|
||||||
|
python scripts/search_pubmed.py --query-file query.txt --limit 500
|
||||||
|
```
|
||||||
|
|
||||||
|
### Batch Searches
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Search multiple topics
|
||||||
|
TOPICS=("diabetes treatment" "cancer immunotherapy" "CRISPR gene editing")
|
||||||
|
|
||||||
|
for topic in "${TOPICS[@]}"; do
|
||||||
|
python scripts/search_pubmed.py "$topic" \
|
||||||
|
--limit 100 \
|
||||||
|
--output "${topic// /_}.json"
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
### Extract Metadata
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Search returns PMIDs
|
||||||
|
python scripts/search_pubmed.py "topic" --output results.json
|
||||||
|
|
||||||
|
# Extract full metadata
|
||||||
|
python scripts/extract_metadata.py \
|
||||||
|
--input results.json \
|
||||||
|
--output references.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tips and Best Practices
|
||||||
|
|
||||||
|
### Search Construction
|
||||||
|
|
||||||
|
1. **Start with MeSH terms**:
|
||||||
|
- Use MeSH Browser to find correct terms
|
||||||
|
- More precise than keyword search
|
||||||
|
- Captures all papers on topic regardless of terminology
|
||||||
|
|
||||||
|
2. **Include text word variants**:
|
||||||
|
```
|
||||||
|
# Better coverage
|
||||||
|
("Diabetes Mellitus"[MeSH] OR diabetes OR diabetic)
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Use field tags appropriately**:
|
||||||
|
- `[MeSH]` for standardized concepts
|
||||||
|
- `[Title/Abstract]` for specific terms
|
||||||
|
- `[Author]` for known authors
|
||||||
|
- `[Journal]` for specific venues
|
||||||
|
|
||||||
|
4. **Build incrementally**:
|
||||||
|
```
|
||||||
|
# Step 1: Basic search
|
||||||
|
diabetes
|
||||||
|
|
||||||
|
# Step 2: Add specificity
|
||||||
|
"Diabetes Mellitus, Type 2"[MeSH]
|
||||||
|
|
||||||
|
# Step 3: Add treatment
|
||||||
|
"Diabetes Mellitus, Type 2"[MeSH] AND "Metformin"[MeSH]
|
||||||
|
|
||||||
|
# Step 4: Add study type
|
||||||
|
"Diabetes Mellitus, Type 2"[MeSH] AND "Metformin"[MeSH]
|
||||||
|
AND "Clinical Trial"[Publication Type]
|
||||||
|
|
||||||
|
# Step 5: Add date range
|
||||||
|
... AND 2020:2024[Publication Date]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Optimizing Results
|
||||||
|
|
||||||
|
1. **Too many results**: Add filters
|
||||||
|
- Restrict publication type
|
||||||
|
- Narrow date range
|
||||||
|
- Add more specific MeSH terms
|
||||||
|
- Use Major Topic: `[MeSH Major Topic]`
|
||||||
|
|
||||||
|
2. **Too few results**: Broaden search
|
||||||
|
- Remove restrictive filters
|
||||||
|
- Use OR for synonyms
|
||||||
|
- Expand date range
|
||||||
|
- Use MeSH explosion (default)
|
||||||
|
|
||||||
|
3. **Irrelevant results**: Refine terms
|
||||||
|
- Use more specific MeSH terms
|
||||||
|
- Add exclusions with NOT
|
||||||
|
- Use Title field instead of all fields
|
||||||
|
- Add MeSH subheadings
|
||||||
|
|
||||||
|
### Quality Control
|
||||||
|
|
||||||
|
1. **Document search strategy**:
|
||||||
|
- Save exact query string
|
||||||
|
- Record search date
|
||||||
|
- Note number of results
|
||||||
|
- Save filters used
|
||||||
|
|
||||||
|
2. **Export systematically**:
|
||||||
|
- Use consistent file naming
|
||||||
|
- Export to JSON for flexibility
|
||||||
|
- Convert to BibTeX as needed
|
||||||
|
- Keep original search results
|
||||||
|
|
||||||
|
3. **Validate retrieved citations**:
|
||||||
|
```bash
|
||||||
|
python scripts/validate_citations.py pubmed_results.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
### Staying Current
|
||||||
|
|
||||||
|
1. **Set up search alerts**:
|
||||||
|
- PubMed → Save search
|
||||||
|
- Receive email updates
|
||||||
|
- Daily, weekly, or monthly
|
||||||
|
|
||||||
|
2. **Track specific journals**:
|
||||||
|
```
|
||||||
|
"Nature"[Journal] AND CRISPR[Title]
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Follow key authors**:
|
||||||
|
```
|
||||||
|
"Church GM"[Author]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Issues and Solutions
|
||||||
|
|
||||||
|
### Issue: MeSH Term Not Found
|
||||||
|
|
||||||
|
**Solution**:
|
||||||
|
- Check spelling
|
||||||
|
- Use MeSH Browser
|
||||||
|
- Try related terms
|
||||||
|
- Use text word search as fallback
|
||||||
|
|
||||||
|
### Issue: Zero Results
|
||||||
|
|
||||||
|
**Solution**:
|
||||||
|
- Remove filters
|
||||||
|
- Check query syntax
|
||||||
|
- Use OR for broader search
|
||||||
|
- Try synonyms
|
||||||
|
|
||||||
|
### Issue: Poor Quality Results
|
||||||
|
|
||||||
|
**Solution**:
|
||||||
|
- Add publication type filters
|
||||||
|
- Restrict to recent years
|
||||||
|
- Use MeSH Major Topic
|
||||||
|
- Filter by journal quality
|
||||||
|
|
||||||
|
### Issue: Duplicates from Different Sources
|
||||||
|
|
||||||
|
**Solution**:
|
||||||
|
```bash
|
||||||
|
python scripts/format_bibtex.py results.bib \
|
||||||
|
--deduplicate \
|
||||||
|
--output clean.bib
|
||||||
|
```
|
||||||
|
|
||||||
|
### Issue: API Rate Limiting
|
||||||
|
|
||||||
|
**Solution**:
|
||||||
|
- Get API key (increases limit to 10/sec)
|
||||||
|
- Add delays in scripts
|
||||||
|
- Process in batches
|
||||||
|
- Use off-peak hours
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
PubMed provides authoritative biomedical literature search:
|
||||||
|
|
||||||
|
✓ **Curated content**: MeSH indexing, quality control
|
||||||
|
✓ **Precise search**: Field tags, MeSH terms, filters
|
||||||
|
✓ **Programmatic access**: E-utilities API
|
||||||
|
✓ **Free access**: No subscription required
|
||||||
|
✓ **Comprehensive**: 35M+ citations, daily updates
|
||||||
|
|
||||||
|
Key strategies:
|
||||||
|
- Use MeSH terms for precise searching
|
||||||
|
- Combine with text words for comprehensive coverage
|
||||||
|
- Apply appropriate field tags
|
||||||
|
- Filter by publication type and date
|
||||||
|
- Use E-utilities API for automation
|
||||||
|
- Document search strategy for reproducibility
|
||||||
|
|
||||||
|
For broader coverage across disciplines, complement with Google Scholar.
|
||||||
|
|
||||||
204
scripts/doi_to_bibtex.py
Normal file
204
scripts/doi_to_bibtex.py
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
DOI to BibTeX Converter
|
||||||
|
Quick utility to convert DOIs to BibTeX format using CrossRef API.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import requests
|
||||||
|
import argparse
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
from typing import Optional, List
|
||||||
|
|
||||||
|
class DOIConverter:
    """Convert DOIs to BibTeX entries using CrossRef API."""

    # Prefixes commonly pasted along with a DOI (URL forms and the 'doi:'
    # scheme). Matched case-insensitively and stripped before lookup.
    _DOI_PREFIXES = ('https://doi.org/', 'http://doi.org/', 'doi:')

    def __init__(self):
        # One shared session gives connection pooling across multiple lookups.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'DOIConverter/1.0 (Citation Management Tool; mailto:support@example.com)'
        })

    @staticmethod
    def _normalize_doi(doi: str) -> str:
        """
        Strip surrounding whitespace and a single leading URL/'doi:' prefix.

        The prefix match is case-insensitive (handles 'DOI:10...'), but only
        one prefix is removed and only at the start of the string. The
        original implementation used str.replace() without a count, which
        removed every occurrence of the prefix substring and could corrupt a
        DOI whose suffix happened to contain e.g. 'doi:'.

        Args:
            doi: Raw DOI string as supplied by the user.

        Returns:
            The bare DOI, suitable for appending to https://doi.org/.
        """
        doi = doi.strip()
        lowered = doi.lower()
        for prefix in DOIConverter._DOI_PREFIXES:
            if lowered.startswith(prefix):
                return doi[len(prefix):]
        return doi

    def doi_to_bibtex(self, doi: str) -> Optional[str]:
        """
        Convert a single DOI to BibTeX format.

        Args:
            doi: Digital Object Identifier (bare, 'doi:'-prefixed, or a full
                https://doi.org/ URL)

        Returns:
            BibTeX string or None if conversion fails
        """
        # Clean DOI (remove URL prefix if present)
        doi = self._normalize_doi(doi)

        # Request BibTeX from CrossRef content negotiation: the Accept header
        # asks the doi.org resolver to return application/x-bibtex directly.
        url = f'https://doi.org/{doi}'
        headers = {
            'Accept': 'application/x-bibtex',
            'User-Agent': 'DOIConverter/1.0 (Citation Management Tool)'
        }

        try:
            response = self.session.get(url, headers=headers, timeout=15)

            if response.status_code == 200:
                bibtex = response.text.strip()
                # CrossRef sometimes returns entries with @data type, convert to @misc
                if bibtex.startswith('@data{'):
                    bibtex = bibtex.replace('@data{', '@misc{', 1)
                return bibtex
            elif response.status_code == 404:
                print(f'Error: DOI not found: {doi}', file=sys.stderr)
                return None
            else:
                print(f'Error: Failed to retrieve BibTeX for {doi} (status {response.status_code})', file=sys.stderr)
                return None

        except requests.exceptions.Timeout:
            print(f'Error: Request timeout for DOI: {doi}', file=sys.stderr)
            return None
        except requests.exceptions.RequestException as e:
            print(f'Error: Request failed for {doi}: {e}', file=sys.stderr)
            return None

    def convert_multiple(self, dois: List[str], delay: float = 0.5) -> List[str]:
        """
        Convert multiple DOIs to BibTeX.

        Args:
            dois: List of DOIs
            delay: Delay between requests (seconds) for rate limiting

        Returns:
            List of BibTeX entries (excludes failed conversions)
        """
        bibtex_entries = []

        for i, doi in enumerate(dois):
            print(f'Converting DOI {i+1}/{len(dois)}: {doi}', file=sys.stderr)
            bibtex = self.doi_to_bibtex(doi)

            if bibtex:
                bibtex_entries.append(bibtex)

            # Rate limiting
            if i < len(dois) - 1:  # Don't delay after last request
                time.sleep(delay)

        return bibtex_entries
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line interface.

    Collects DOIs from positional arguments and/or an input file, converts
    them via DOIConverter, and writes the result as BibTeX or JSON to a
    file or stdout. Exits with status 1 on any unrecoverable error.
    """
    parser = argparse.ArgumentParser(
        description='Convert DOIs to BibTeX format using CrossRef API',
        epilog='Example: python doi_to_bibtex.py 10.1038/s41586-021-03819-2'
    )

    parser.add_argument(
        'dois',
        nargs='*',
        help='DOI(s) to convert (can provide multiple)'
    )

    parser.add_argument(
        '-i', '--input',
        help='Input file with DOIs (one per line)'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file for BibTeX (default: stdout)'
    )

    parser.add_argument(
        '--delay',
        type=float,
        default=0.5,
        help='Delay between requests in seconds (default: 0.5)'
    )

    parser.add_argument(
        '--format',
        choices=['bibtex', 'json'],
        default='bibtex',
        help='Output format (default: bibtex)'
    )

    args = parser.parse_args()

    # Collect DOIs from command line and/or file
    dois = []

    if args.dois:
        dois.extend(args.dois)

    if args.input:
        try:
            with open(args.input, 'r', encoding='utf-8') as f:
                # Blank lines are skipped; everything else is treated as a DOI.
                file_dois = [line.strip() for line in f if line.strip()]
                dois.extend(file_dois)
        except FileNotFoundError:
            print(f'Error: Input file not found: {args.input}', file=sys.stderr)
            sys.exit(1)
        except Exception as e:
            print(f'Error reading input file: {e}', file=sys.stderr)
            sys.exit(1)

    # No DOIs from either source: show usage and bail out.
    if not dois:
        parser.print_help()
        sys.exit(1)

    # Convert DOIs
    converter = DOIConverter()

    if len(dois) == 1:
        # Single DOI: skip the batch path (no progress output, no delay).
        bibtex = converter.doi_to_bibtex(dois[0])
        if bibtex:
            bibtex_entries = [bibtex]
        else:
            sys.exit(1)
    else:
        bibtex_entries = converter.convert_multiple(dois, delay=args.delay)

    if not bibtex_entries:
        print('Error: No successful conversions', file=sys.stderr)
        sys.exit(1)

    # Format output
    if args.format == 'bibtex':
        output = '\n\n'.join(bibtex_entries) + '\n'
    else:  # json
        output = json.dumps({
            'count': len(bibtex_entries),
            'entries': bibtex_entries
        }, indent=2)

    # Write output
    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(output)
            print(f'Successfully wrote {len(bibtex_entries)} entries to {args.output}', file=sys.stderr)
        except Exception as e:
            print(f'Error writing output file: {e}', file=sys.stderr)
            sys.exit(1)
    else:
        print(output)

    # Summary (stderr, so it does not pollute piped BibTeX output)
    if len(dois) > 1:
        success_rate = len(bibtex_entries) / len(dois) * 100
        print(f'\nConverted {len(bibtex_entries)}/{len(dois)} DOIs ({success_rate:.1f}%)', file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
||||||
569
scripts/extract_metadata.py
Executable file
569
scripts/extract_metadata.py
Executable file
@@ -0,0 +1,569 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Metadata Extraction Tool
|
||||||
|
Extract citation metadata from DOI, PMID, arXiv ID, or URL using various APIs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import argparse
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from typing import Optional, Dict, List, Tuple
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
class MetadataExtractor:
    """Extract citation metadata and generate BibTeX.

    Supports DOIs (CrossRef REST API), PMIDs (NCBI E-utilities), arXiv IDs
    (arXiv Atom API), and URLs that embed one of those identifiers.
    """

    def __init__(self, email: Optional[str] = None):
        """
        Initialize extractor.

        Args:
            email: Email for Entrez API (recommended for PubMed). Falls back
                to the NCBI_EMAIL environment variable when not provided.
        """
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'MetadataExtractor/1.0 (Citation Management Tool)'
        })
        self.email = email or os.getenv('NCBI_EMAIL', '')

    def identify_type(self, identifier: str) -> Tuple[str, str]:
        """
        Identify the type of identifier.

        Args:
            identifier: DOI, PMID, arXiv ID, or URL

        Returns:
            Tuple of (type, cleaned_identifier); type is one of 'doi',
            'pmid', 'pmcid', 'arxiv', 'url', or 'unknown'.
        """
        identifier = identifier.strip()

        # Check if URL — URLs are dissected for an embedded DOI/PMID/arXiv ID.
        if identifier.startswith('http://') or identifier.startswith('https://'):
            return self._parse_url(identifier)

        # Check for DOI — all DOIs begin with the '10.' directory indicator.
        if identifier.startswith('10.'):
            return ('doi', identifier)

        # Check for arXiv ID (new-style YYMM.NNNN[N], optional version suffix)
        if re.match(r'^\d{4}\.\d{4,5}(v\d+)?$', identifier):
            return ('arxiv', identifier)
        if identifier.startswith('arXiv:'):
            return ('arxiv', identifier.replace('arXiv:', ''))

        # Check for PMID (8-digit number typically)
        if identifier.isdigit() and len(identifier) >= 7:
            return ('pmid', identifier)

        # Check for PMCID
        if identifier.upper().startswith('PMC') and identifier[3:].isdigit():
            return ('pmcid', identifier.upper())

        return ('unknown', identifier)

    def _parse_url(self, url: str) -> Tuple[str, str]:
        """Parse URL to extract identifier type and value."""
        parsed = urlparse(url)

        # DOI URLs
        if 'doi.org' in parsed.netloc:
            doi = parsed.path.lstrip('/')
            return ('doi', doi)

        # PubMed URLs
        if 'pubmed.ncbi.nlm.nih.gov' in parsed.netloc or 'ncbi.nlm.nih.gov/pubmed' in url:
            pmid = re.search(r'/(\d+)', parsed.path)
            if pmid:
                return ('pmid', pmid.group(1))

        # arXiv URLs
        if 'arxiv.org' in parsed.netloc:
            arxiv_id = re.search(r'/abs/(\d{4}\.\d{4,5})', parsed.path)
            if arxiv_id:
                return ('arxiv', arxiv_id.group(1))

        # Nature, Science, Cell, etc. - try to extract DOI from URL
        doi_match = re.search(r'10\.\d{4,}/[^\s/]+', url)
        if doi_match:
            return ('doi', doi_match.group())

        return ('url', url)

    def extract_from_doi(self, doi: str) -> Optional[Dict]:
        """
        Extract metadata from DOI using CrossRef API.

        Args:
            doi: Digital Object Identifier

        Returns:
            Metadata dictionary or None
        """
        url = f'https://api.crossref.org/works/{doi}'

        try:
            response = self.session.get(url, timeout=15)

            if response.status_code == 200:
                data = response.json()
                message = data.get('message', {})

                # CrossRef wraps several fields (title, container-title) in
                # one-element lists; unwrap them here.
                metadata = {
                    'type': 'doi',
                    'entry_type': self._crossref_type_to_bibtex(message.get('type')),
                    'doi': doi,
                    'title': message.get('title', [''])[0],
                    'authors': self._format_authors_crossref(message.get('author', [])),
                    'year': self._extract_year_crossref(message),
                    'journal': message.get('container-title', [''])[0] if message.get('container-title') else '',
                    'volume': str(message.get('volume', '')) if message.get('volume') else '',
                    'issue': str(message.get('issue', '')) if message.get('issue') else '',
                    'pages': message.get('page', ''),
                    'publisher': message.get('publisher', ''),
                    'url': f'https://doi.org/{doi}'
                }

                return metadata
            else:
                print(f'Error: CrossRef API returned status {response.status_code} for DOI: {doi}', file=sys.stderr)
                return None

        except Exception as e:
            print(f'Error extracting metadata from DOI {doi}: {e}', file=sys.stderr)
            return None

    def extract_from_pmid(self, pmid: str) -> Optional[Dict]:
        """
        Extract metadata from PMID using PubMed E-utilities.

        Args:
            pmid: PubMed ID

        Returns:
            Metadata dictionary or None
        """
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
        params = {
            'db': 'pubmed',
            'id': pmid,
            'retmode': 'xml',
            'rettype': 'abstract'
        }

        # NCBI asks clients to identify themselves; both are optional but
        # raise the rate limit when present.
        if self.email:
            params['email'] = self.email

        api_key = os.getenv('NCBI_API_KEY')
        if api_key:
            params['api_key'] = api_key

        try:
            response = self.session.get(url, params=params, timeout=15)

            if response.status_code == 200:
                root = ET.fromstring(response.content)
                article = root.find('.//PubmedArticle')

                if article is None:
                    print(f'Error: No article found for PMID: {pmid}', file=sys.stderr)
                    return None

                # Extract metadata from XML
                medline_citation = article.find('.//MedlineCitation')
                article_elem = medline_citation.find('.//Article')
                journal = article_elem.find('.//Journal')

                # Get DOI if available
                doi = None
                article_ids = article.findall('.//ArticleId')
                for article_id in article_ids:
                    if article_id.get('IdType') == 'doi':
                        doi = article_id.text
                        break

                metadata = {
                    'type': 'pmid',
                    'entry_type': 'article',
                    'pmid': pmid,
                    'title': article_elem.findtext('.//ArticleTitle', ''),
                    'authors': self._format_authors_pubmed(article_elem.findall('.//Author')),
                    'year': self._extract_year_pubmed(article_elem),
                    'journal': journal.findtext('.//Title', ''),
                    'volume': journal.findtext('.//JournalIssue/Volume', ''),
                    'issue': journal.findtext('.//JournalIssue/Issue', ''),
                    'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
                    'doi': doi
                }

                return metadata
            else:
                print(f'Error: PubMed API returned status {response.status_code} for PMID: {pmid}', file=sys.stderr)
                return None

        except Exception as e:
            print(f'Error extracting metadata from PMID {pmid}: {e}', file=sys.stderr)
            return None

    def extract_from_arxiv(self, arxiv_id: str) -> Optional[Dict]:
        """
        Extract metadata from arXiv ID using arXiv API.

        Args:
            arxiv_id: arXiv identifier

        Returns:
            Metadata dictionary or None
        """
        url = 'http://export.arxiv.org/api/query'
        params = {
            'id_list': arxiv_id,
            'max_results': 1
        }

        try:
            response = self.session.get(url, params=params, timeout=15)

            if response.status_code == 200:
                # Parse Atom XML
                root = ET.fromstring(response.content)
                ns = {'atom': 'http://www.w3.org/2005/Atom', 'arxiv': 'http://arxiv.org/schemas/atom'}

                entry = root.find('atom:entry', ns)
                if entry is None:
                    print(f'Error: No entry found for arXiv ID: {arxiv_id}', file=sys.stderr)
                    return None

                # Extract DOI if published
                doi_elem = entry.find('arxiv:doi', ns)
                doi = doi_elem.text if doi_elem is not None else None

                # Extract journal reference if published
                journal_ref_elem = entry.find('arxiv:journal_ref', ns)
                journal_ref = journal_ref_elem.text if journal_ref_elem is not None else None

                # Get publication date (Atom timestamp starts with YYYY)
                published = entry.findtext('atom:published', '', ns)
                year = published[:4] if published else ''

                # Get authors
                authors = []
                for author in entry.findall('atom:author', ns):
                    name = author.findtext('atom:name', '', ns)
                    if name:
                        authors.append(name)

                metadata = {
                    'type': 'arxiv',
                    # A DOI means the preprint was published — cite as article.
                    'entry_type': 'misc' if not doi else 'article',
                    'arxiv_id': arxiv_id,
                    'title': entry.findtext('atom:title', '', ns).strip().replace('\n', ' '),
                    'authors': ' and '.join(authors),
                    'year': year,
                    'doi': doi,
                    'journal_ref': journal_ref,
                    'abstract': entry.findtext('atom:summary', '', ns).strip().replace('\n', ' '),
                    'url': f'https://arxiv.org/abs/{arxiv_id}'
                }

                return metadata
            else:
                print(f'Error: arXiv API returned status {response.status_code} for ID: {arxiv_id}', file=sys.stderr)
                return None

        except Exception as e:
            print(f'Error extracting metadata from arXiv {arxiv_id}: {e}', file=sys.stderr)
            return None

    def metadata_to_bibtex(self, metadata: Dict, citation_key: Optional[str] = None) -> str:
        """
        Convert metadata dictionary to BibTeX format.

        Args:
            metadata: Metadata dictionary
            citation_key: Optional custom citation key (auto-generated when omitted)

        Returns:
            BibTeX string
        """
        if not citation_key:
            citation_key = self._generate_citation_key(metadata)

        entry_type = metadata.get('entry_type', 'misc')

        # Build BibTeX entry
        lines = [f'@{entry_type}{{{citation_key},']

        # Add fields
        if metadata.get('authors'):
            lines.append(f'  author = {{{metadata["authors"]}}},')

        if metadata.get('title'):
            # Protect capitalization
            title = self._protect_title(metadata['title'])
            lines.append(f'  title = {{{title}}},')

        if entry_type == 'article' and metadata.get('journal'):
            lines.append(f'  journal = {{{metadata["journal"]}}},')
        elif entry_type == 'misc' and metadata.get('type') == 'arxiv':
            lines.append(f'  howpublished = {{arXiv}},')

        if metadata.get('year'):
            lines.append(f'  year = {{{metadata["year"]}}},')

        if metadata.get('volume'):
            lines.append(f'  volume = {{{metadata["volume"]}}},')

        if metadata.get('issue'):
            lines.append(f'  number = {{{metadata["issue"]}}},')

        if metadata.get('pages'):
            pages = metadata['pages']
            # Normalize page ranges to the BibTeX en-dash form, but only when
            # the range is not already normalized (the unconditional replace
            # turned '583--589' into '583----589').
            if '--' not in pages:
                pages = pages.replace('-', '--')
            lines.append(f'  pages = {{{pages}}},')

        if metadata.get('doi'):
            lines.append(f'  doi = {{{metadata["doi"]}}},')
        elif metadata.get('url'):
            lines.append(f'  url = {{{metadata["url"]}}},')

        if metadata.get('pmid'):
            lines.append(f'  note = {{PMID: {metadata["pmid"]}}},')

        if metadata.get('type') == 'arxiv' and not metadata.get('doi'):
            lines.append(f'  note = {{Preprint}},')

        # Remove trailing comma from last field
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]

        lines.append('}')

        return '\n'.join(lines)

    def _crossref_type_to_bibtex(self, crossref_type: str) -> str:
        """Map CrossRef type to BibTeX entry type ('misc' for anything unknown)."""
        type_map = {
            'journal-article': 'article',
            'book': 'book',
            'book-chapter': 'incollection',
            'proceedings-article': 'inproceedings',
            'posted-content': 'misc',
            'dataset': 'misc',
            'report': 'techreport'
        }
        return type_map.get(crossref_type, 'misc')

    def _format_authors_crossref(self, authors: List[Dict]) -> str:
        """Format author list from CrossRef data as 'Family, Given and ...'."""
        if not authors:
            return ''

        formatted = []
        for author in authors:
            given = author.get('given', '')
            family = author.get('family', '')
            if family:
                if given:
                    formatted.append(f'{family}, {given}')
                else:
                    formatted.append(family)

        return ' and '.join(formatted)

    def _format_authors_pubmed(self, authors: List) -> str:
        """Format author list from PubMed XML as 'LastName, ForeName and ...'."""
        formatted = []
        for author in authors:
            last_name = author.findtext('.//LastName', '')
            fore_name = author.findtext('.//ForeName', '')
            if last_name:
                if fore_name:
                    formatted.append(f'{last_name}, {fore_name}')
                else:
                    formatted.append(last_name)

        return ' and '.join(formatted)

    def _extract_year_crossref(self, message: Dict) -> str:
        """Extract year from CrossRef message ('' when no date is present)."""
        # Try published-print first, then published-online
        date_parts = message.get('published-print', {}).get('date-parts', [[]])
        if not date_parts or not date_parts[0]:
            date_parts = message.get('published-online', {}).get('date-parts', [[]])

        if date_parts and date_parts[0]:
            # date-parts is [[year, month, day]]; only the year is needed.
            return str(date_parts[0][0])
        return ''

    def _extract_year_pubmed(self, article: ET.Element) -> str:
        """Extract year from PubMed XML, falling back to MedlineDate."""
        year = article.findtext('.//Journal/JournalIssue/PubDate/Year', '')
        if not year:
            # Some records carry only a free-text MedlineDate like '2020 Jan-Feb';
            # grab the first 4-digit run as the year.
            medline_date = article.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
            if medline_date:
                year_match = re.search(r'\d{4}', medline_date)
                if year_match:
                    year = year_match.group()
        return year

    def _generate_citation_key(self, metadata: Dict) -> str:
        """Generate a citation key of the form <LastName><year><keyword>."""
        # Get first author last name
        authors = metadata.get('authors', '')
        if authors:
            first_author = authors.split(' and ')[0]
            if ',' in first_author:
                last_name = first_author.split(',')[0].strip()
            else:
                last_name = first_author.split()[-1] if first_author else 'Unknown'
        else:
            last_name = 'Unknown'

        # Get year
        year = metadata.get('year', '').strip()
        if not year:
            year = 'XXXX'

        # Clean last name (remove special characters)
        last_name = re.sub(r'[^a-zA-Z]', '', last_name)

        # Get keyword from title (first word of 4+ letters)
        title = metadata.get('title', '')
        words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
        keyword = words[0].lower() if words else 'paper'

        return f'{last_name}{year}{keyword}'

    def _protect_title(self, title: str) -> str:
        """Protect capitalization in title for BibTeX by bracing known acronyms."""
        # Protect common acronyms and proper nouns. NOTE: matching is
        # case-insensitive and the replacement is the canonical spelling, so
        # e.g. 'Covid' becomes '{COVID}'.
        protected_words = [
            'DNA', 'RNA', 'CRISPR', 'COVID', 'HIV', 'AIDS', 'AlphaFold',
            'Python', 'AI', 'ML', 'GPU', 'CPU', 'USA', 'UK', 'EU'
        ]

        for word in protected_words:
            title = re.sub(rf'\b{word}\b', f'{{{word}}}', title, flags=re.IGNORECASE)

        return title

    def extract(self, identifier: str) -> Optional[str]:
        """
        Extract metadata and return BibTeX.

        Args:
            identifier: DOI, PMID, arXiv ID, or URL

        Returns:
            BibTeX string or None
        """
        id_type, clean_id = self.identify_type(identifier)

        print(f'Identified as {id_type}: {clean_id}', file=sys.stderr)

        metadata = None

        # NOTE: 'pmcid' and bare 'url' identifiers are detected by
        # identify_type but not yet supported here; they fall through to the
        # unknown-type error below.
        if id_type == 'doi':
            metadata = self.extract_from_doi(clean_id)
        elif id_type == 'pmid':
            metadata = self.extract_from_pmid(clean_id)
        elif id_type == 'arxiv':
            metadata = self.extract_from_arxiv(clean_id)
        else:
            print(f'Error: Unknown identifier type: {identifier}', file=sys.stderr)
            return None

        if metadata:
            return self.metadata_to_bibtex(metadata)
        else:
            return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line interface.

    Gathers identifiers from flags and/or an input file, extracts metadata
    for each via MetadataExtractor, and emits BibTeX or JSON to a file or
    stdout. Exits with status 1 when nothing could be extracted.
    """
    parser = argparse.ArgumentParser(
        description='Extract citation metadata from DOI, PMID, arXiv ID, or URL',
        epilog='Example: python extract_metadata.py --doi 10.1038/s41586-021-03819-2'
    )

    parser.add_argument('--doi', help='Digital Object Identifier')
    parser.add_argument('--pmid', help='PubMed ID')
    parser.add_argument('--arxiv', help='arXiv ID')
    parser.add_argument('--url', help='URL to article')
    parser.add_argument('-i', '--input', help='Input file with identifiers (one per line)')
    parser.add_argument('-o', '--output', help='Output file for BibTeX (default: stdout)')
    parser.add_argument('--format', choices=['bibtex', 'json'], default='bibtex', help='Output format')
    parser.add_argument('--email', help='Email for NCBI E-utilities (recommended)')

    args = parser.parse_args()

    # Collect identifiers (flags may be combined; all are processed)
    identifiers = []
    if args.doi:
        identifiers.append(args.doi)
    if args.pmid:
        identifiers.append(args.pmid)
    if args.arxiv:
        identifiers.append(args.arxiv)
    if args.url:
        identifiers.append(args.url)

    if args.input:
        try:
            with open(args.input, 'r', encoding='utf-8') as f:
                # Skip blank lines; each remaining line is one identifier.
                file_ids = [line.strip() for line in f if line.strip()]
                identifiers.extend(file_ids)
        except Exception as e:
            print(f'Error reading input file: {e}', file=sys.stderr)
            sys.exit(1)

    # Nothing to do: show usage and exit with failure.
    if not identifiers:
        parser.print_help()
        sys.exit(1)

    # Extract metadata
    extractor = MetadataExtractor(email=args.email)
    bibtex_entries = []

    for i, identifier in enumerate(identifiers):
        print(f'\nProcessing {i+1}/{len(identifiers)}...', file=sys.stderr)
        bibtex = extractor.extract(identifier)
        if bibtex:
            bibtex_entries.append(bibtex)

        # Rate limiting (skip the pause after the last identifier)
        if i < len(identifiers) - 1:
            time.sleep(0.5)

    if not bibtex_entries:
        print('Error: No successful extractions', file=sys.stderr)
        sys.exit(1)

    # Format output
    if args.format == 'bibtex':
        output = '\n\n'.join(bibtex_entries) + '\n'
    else:  # json
        output = json.dumps({
            'count': len(bibtex_entries),
            'entries': bibtex_entries
        }, indent=2)

    # Write output
    # NOTE(review): unlike the input path, this write has no try/except —
    # an unwritable output file raises an uncaught exception.
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'\nSuccessfully wrote {len(bibtex_entries)} entries to {args.output}', file=sys.stderr)
    else:
        print(output)

    # Summary on stderr so piped output stays clean
    print(f'\nExtracted {len(bibtex_entries)}/{len(identifiers)} entries', file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
||||||
|
|
||||||
349
scripts/format_bibtex.py
Executable file
349
scripts/format_bibtex.py
Executable file
@@ -0,0 +1,349 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
BibTeX Formatter and Cleaner
|
||||||
|
Format, clean, sort, and deduplicate BibTeX files.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import argparse
|
||||||
|
from typing import List, Dict, Tuple
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
class BibTeXFormatter:
|
||||||
|
"""Format and clean BibTeX entries."""
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
# Standard field order for readability
|
||||||
|
self.field_order = [
|
||||||
|
'author', 'editor', 'title', 'booktitle', 'journal',
|
||||||
|
'year', 'month', 'volume', 'number', 'pages',
|
||||||
|
'publisher', 'address', 'edition', 'series',
|
||||||
|
'school', 'institution', 'organization',
|
||||||
|
'howpublished', 'doi', 'url', 'isbn', 'issn',
|
||||||
|
'note', 'abstract', 'keywords'
|
||||||
|
]
|
||||||
|
|
||||||
|
    def parse_bibtex_file(self, filepath: str) -> List[Dict]:
        """
        Parse BibTeX file and extract entries.

        Args:
            filepath: Path to BibTeX file

        Returns:
            List of entry dictionaries, each with 'type' (lowercased entry
            type), 'key' (citation key), and 'fields' (OrderedDict mapping
            lowercased field name -> stripped value). Returns [] when the
            file cannot be read.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            print(f'Error reading file: {e}', file=sys.stderr)
            return []

        entries = []

        # Match BibTeX entries
        # NOTE(review): this relies on the closing '}' sitting on its own
        # line at column 0 (the '\n\}' anchor) — entries formatted with the
        # brace at the end of the last field line will not match.
        pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
        matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)

        for match in matches:
            entry_type = match.group(1).lower()
            citation_key = match.group(2).strip()
            fields_text = match.group(3)

            # Parse fields
            fields = OrderedDict()
            # Alternation: groups 1/2 capture brace-delimited values,
            # groups 3/4 capture quote-delimited values.
            # NOTE(review): '[^}]*' cannot handle nested braces, so a value
            # like 'title = {The {CRISPR} story}' is truncated at the first
            # inner '}' — confirm inputs never use brace-protected words
            # before relying on this parser.
            field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
            field_matches = re.finditer(field_pattern, fields_text)

            for field_match in field_matches:
                if field_match.group(1):
                    field_name = field_match.group(1).lower()
                    field_value = field_match.group(2)
                else:
                    field_name = field_match.group(3).lower()
                    field_value = field_match.group(4)

                fields[field_name] = field_value.strip()

            entries.append({
                'type': entry_type,
                'key': citation_key,
                'fields': fields
            })

        return entries
|
||||||
|
|
||||||
|
def format_entry(self, entry: Dict) -> str:
|
||||||
|
"""
|
||||||
|
Format a single BibTeX entry.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
entry: Entry dictionary
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Formatted BibTeX string
|
||||||
|
"""
|
||||||
|
lines = [f'@{entry["type"]}{{{entry["key"]},']
|
||||||
|
|
||||||
|
# Order fields according to standard order
|
||||||
|
ordered_fields = OrderedDict()
|
||||||
|
|
||||||
|
# Add fields in standard order
|
||||||
|
for field_name in self.field_order:
|
||||||
|
if field_name in entry['fields']:
|
||||||
|
ordered_fields[field_name] = entry['fields'][field_name]
|
||||||
|
|
||||||
|
# Add any remaining fields
|
||||||
|
for field_name, field_value in entry['fields'].items():
|
||||||
|
if field_name not in ordered_fields:
|
||||||
|
ordered_fields[field_name] = field_value
|
||||||
|
|
||||||
|
# Format each field
|
||||||
|
max_field_len = max(len(f) for f in ordered_fields.keys()) if ordered_fields else 0
|
||||||
|
|
||||||
|
for field_name, field_value in ordered_fields.items():
|
||||||
|
# Pad field name for alignment
|
||||||
|
padded_field = field_name.ljust(max_field_len)
|
||||||
|
lines.append(f' {padded_field} = {{{field_value}}},')
|
||||||
|
|
||||||
|
# Remove trailing comma from last field
|
||||||
|
if lines[-1].endswith(','):
|
||||||
|
lines[-1] = lines[-1][:-1]
|
||||||
|
|
||||||
|
lines.append('}')
|
||||||
|
|
||||||
|
return '\n'.join(lines)
|
||||||
|
|
||||||
|
def fix_common_issues(self, entry: Dict) -> Dict:
|
||||||
|
"""
|
||||||
|
Fix common formatting issues in entry.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
entry: Entry dictionary
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Fixed entry dictionary
|
||||||
|
"""
|
||||||
|
fixed = entry.copy()
|
||||||
|
fields = fixed['fields'].copy()
|
||||||
|
|
||||||
|
# Fix page ranges (single hyphen to double hyphen)
|
||||||
|
if 'pages' in fields:
|
||||||
|
pages = fields['pages']
|
||||||
|
# Replace single hyphen with double hyphen if it's a range
|
||||||
|
if re.search(r'\d-\d', pages) and '--' not in pages:
|
||||||
|
pages = re.sub(r'(\d)-(\d)', r'\1--\2', pages)
|
||||||
|
fields['pages'] = pages
|
||||||
|
|
||||||
|
# Remove "pp." from pages
|
||||||
|
if 'pages' in fields:
|
||||||
|
pages = fields['pages']
|
||||||
|
pages = re.sub(r'^pp\.\s*', '', pages, flags=re.IGNORECASE)
|
||||||
|
fields['pages'] = pages
|
||||||
|
|
||||||
|
# Fix DOI (remove URL prefix if present)
|
||||||
|
if 'doi' in fields:
|
||||||
|
doi = fields['doi']
|
||||||
|
doi = doi.replace('https://doi.org/', '')
|
||||||
|
doi = doi.replace('http://doi.org/', '')
|
||||||
|
doi = doi.replace('doi:', '')
|
||||||
|
fields['doi'] = doi
|
||||||
|
|
||||||
|
# Fix author separators (semicolon or ampersand to 'and')
|
||||||
|
if 'author' in fields:
|
||||||
|
author = fields['author']
|
||||||
|
author = author.replace(';', ' and')
|
||||||
|
author = author.replace(' & ', ' and ')
|
||||||
|
# Clean up multiple 'and's
|
||||||
|
author = re.sub(r'\s+and\s+and\s+', ' and ', author)
|
||||||
|
fields['author'] = author
|
||||||
|
|
||||||
|
fixed['fields'] = fields
|
||||||
|
return fixed
|
||||||
|
|
||||||
|
def deduplicate_entries(self, entries: List[Dict]) -> List[Dict]:
    """Remove duplicate entries based on DOI or citation key.

    An entry is dropped when its (non-empty) DOI was already seen, or
    when its citation key repeats one already kept. The DOI check runs
    first because DOIs identify works more reliably than keys. Each
    skipped duplicate is reported on stderr. Input order is preserved.

    Args:
        entries: List of entry dictionaries

    Returns:
        List of unique entries
    """
    kept: List[Dict] = []
    dois_seen: set = set()
    keys_seen: set = set()

    for item in entries:
        item_doi = item['fields'].get('doi', '').strip()
        item_key = item['key']

        # DOI-based identity takes precedence when a DOI is present.
        if item_doi:
            if item_doi in dois_seen:
                print(f'Duplicate DOI found: {item_doi} (skipping {item_key})', file=sys.stderr)
                continue
            dois_seen.add(item_doi)

        # Then guard against repeated citation keys.
        if item_key in keys_seen:
            print(f'Duplicate citation key found: {item_key} (skipping)', file=sys.stderr)
            continue
        keys_seen.add(item_key)

        kept.append(item)

    return kept
|
||||||
|
|
||||||
|
def sort_entries(self, entries: List[Dict], sort_by: str = 'key', descending: bool = False) -> List[Dict]:
    """Sort entries by specified field.

    Sorting is stable and does not mutate the input list. Year sorting
    compares numerically (the previous string comparison put '999'
    after '1000'); entries with a missing or non-numeric year sort
    last in ascending order.

    Args:
        entries: List of entry dictionaries
        sort_by: Field to sort by ('key', 'year', 'author', 'title')
        descending: Sort in descending order

    Returns:
        Sorted list of entries
    """
    def get_sort_key(entry: Dict):
        if sort_by == 'year':
            # Numeric year comparison; unparseable years sort last.
            year_match = re.search(r'\d+', entry['fields'].get('year', ''))
            return int(year_match.group()) if year_match else 9999
        if sort_by == 'author':
            author = entry['fields'].get('author', 'ZZZ')
            # Sort by the last name of the first author
            # ("Last, First" lists a comma; otherwise take the first token).
            if ',' in author:
                return author.split(',')[0].lower()
            return author.split()[0].lower() if author else 'zzz'
        if sort_by == 'title':
            return entry['fields'].get('title', '').lower()
        # 'key' and any unknown value fall back to the citation key.
        return entry['key'].lower()

    return sorted(entries, key=get_sort_key, reverse=descending)
|
||||||
|
|
||||||
|
def format_file(self, filepath: str, output: str = None,
                deduplicate: bool = False, sort_by: str = None,
                descending: bool = False, fix_issues: bool = True) -> None:
    """
    Format entire BibTeX file.

    Pipeline: parse -> (optionally) fix common issues -> (optionally)
    deduplicate -> (optionally) sort -> re-serialize. Progress messages
    go to stderr so stdout stays clean. If no entries parse, the input
    file is left untouched. Exits the process with status 1 when the
    output file cannot be written.

    Args:
        filepath: Input BibTeX file
        output: Output file (None for in-place)
        deduplicate: Remove duplicates
        sort_by: Field to sort by
        descending: Sort in descending order
        fix_issues: Fix common formatting issues
    """
    print(f'Parsing {filepath}...', file=sys.stderr)
    entries = self.parse_bibtex_file(filepath)

    if not entries:
        # Bail out early rather than overwriting the input with nothing.
        print('No entries found', file=sys.stderr)
        return

    print(f'Found {len(entries)} entries', file=sys.stderr)

    # Fix common issues (page ranges, DOIs, author separators).
    if fix_issues:
        print('Fixing common issues...', file=sys.stderr)
        entries = [self.fix_common_issues(e) for e in entries]

    # Deduplicate by DOI / citation key and report how many were dropped.
    if deduplicate:
        print('Removing duplicates...', file=sys.stderr)
        original_count = len(entries)
        entries = self.deduplicate_entries(entries)
        removed = original_count - len(entries)
        if removed > 0:
            print(f'Removed {removed} duplicate(s)', file=sys.stderr)

    # Sort
    if sort_by:
        print(f'Sorting by {sort_by}...', file=sys.stderr)
        entries = self.sort_entries(entries, sort_by, descending)

    # Format entries
    print('Formatting entries...', file=sys.stderr)
    formatted_entries = [self.format_entry(e) for e in entries]

    # Entries separated by one blank line, with a trailing newline.
    output_content = '\n\n'.join(formatted_entries) + '\n'

    # Default to overwriting the input file (in-place formatting).
    output_file = output or filepath
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(output_content)
        print(f'Successfully wrote {len(entries)} entries to {output_file}', file=sys.stderr)
    except Exception as e:
        print(f'Error writing file: {e}', file=sys.stderr)
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line interface.

    Parses flags and runs BibTeXFormatter.format_file over the given
    file. Fixing of common issues is on by default and disabled with
    --no-fix; without -o/--output the input file is rewritten in place.
    """
    parser = argparse.ArgumentParser(
        description='Format, clean, sort, and deduplicate BibTeX files',
        epilog='Example: python format_bibtex.py references.bib --deduplicate --sort year'
    )

    parser.add_argument(
        'file',
        help='BibTeX file to format'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file (default: overwrite input file)'
    )

    parser.add_argument(
        '--deduplicate',
        action='store_true',
        help='Remove duplicate entries'
    )

    parser.add_argument(
        '--sort',
        choices=['key', 'year', 'author', 'title'],
        help='Sort entries by field'
    )

    parser.add_argument(
        '--descending',
        action='store_true',
        help='Sort in descending order'
    )

    parser.add_argument(
        '--no-fix',
        action='store_true',
        help='Do not fix common issues'
    )

    args = parser.parse_args()

    # Format file; note --no-fix is inverted into fix_issues here.
    formatter = BibTeXFormatter()
    formatter.format_file(
        args.file,
        output=args.output,
        deduplicate=args.deduplicate,
        sort_by=args.sort,
        descending=args.descending,
        fix_issues=not args.no_fix
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
|
|
||||||
282
scripts/search_google_scholar.py
Executable file
282
scripts/search_google_scholar.py
Executable file
@@ -0,0 +1,282 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Google Scholar Search Tool
|
||||||
|
Search Google Scholar and export results.
|
||||||
|
|
||||||
|
Note: This script requires the 'scholarly' library.
|
||||||
|
Install with: pip install scholarly
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
|
try:
|
||||||
|
from scholarly import scholarly, ProxyGenerator
|
||||||
|
SCHOLARLY_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
SCHOLARLY_AVAILABLE = False
|
||||||
|
print('Warning: scholarly library not installed. Install with: pip install scholarly', file=sys.stderr)
|
||||||
|
|
||||||
|
class GoogleScholarSearcher:
    """Search Google Scholar using scholarly library.

    Relies on the module-level ``scholarly`` import (guarded by
    SCHOLARLY_AVAILABLE). All progress and warnings go to stderr.
    """

    def __init__(self, use_proxy: bool = False):
        """
        Initialize searcher.

        Raises ImportError when the scholarly library is missing.

        Args:
            use_proxy: Use free proxy (helps avoid rate limiting)
        """
        if not SCHOLARLY_AVAILABLE:
            raise ImportError('scholarly library required. Install with: pip install scholarly')

        # Setup proxy if requested; proxy failure is non-fatal — the
        # search simply proceeds without one.
        if use_proxy:
            try:
                pg = ProxyGenerator()
                pg.FreeProxies()
                scholarly.use_proxy(pg)
                print('Using free proxy', file=sys.stderr)
            except Exception as e:
                print(f'Warning: Could not setup proxy: {e}', file=sys.stderr)

    def search(self, query: str, max_results: int = 50,
               year_start: Optional[int] = None, year_end: Optional[int] = None,
               sort_by: str = 'relevance') -> List[Dict]:
        """
        Search Google Scholar.

        Iterates scholarly's result generator up to max_results,
        extracting a flat metadata dict per publication. Results
        collected before an exception are still returned. Note the
        year filter drops results but does not extend the scan, so
        fewer than max_results matching items may come back.

        Args:
            query: Search query
            max_results: Maximum number of results
            year_start: Start year filter
            year_end: End year filter
            sort_by: Sort order ('relevance' or 'citations')

        Returns:
            List of result dictionaries
        """
        if not SCHOLARLY_AVAILABLE:
            print('Error: scholarly library not installed', file=sys.stderr)
            return []

        print(f'Searching Google Scholar: {query}', file=sys.stderr)
        print(f'Max results: {max_results}', file=sys.stderr)

        results = []

        try:
            # Perform search
            search_query = scholarly.search_pubs(query)

            for i, result in enumerate(search_query):
                if i >= max_results:
                    break

                print(f'Retrieved {i+1}/{max_results}', file=sys.stderr)

                # Extract metadata. NOTE(review): field layout ('bib',
                # 'num_citations', 'pub_url', 'eprint_url') follows
                # scholarly's pub dict — verify against the installed
                # scholarly version.
                metadata = {
                    'title': result.get('bib', {}).get('title', ''),
                    'authors': ', '.join(result.get('bib', {}).get('author', [])),
                    'year': result.get('bib', {}).get('pub_year', ''),
                    'venue': result.get('bib', {}).get('venue', ''),
                    'abstract': result.get('bib', {}).get('abstract', ''),
                    'citations': result.get('num_citations', 0),
                    'url': result.get('pub_url', ''),
                    'eprint_url': result.get('eprint_url', ''),
                }

                # Filter by year; a missing year counts as 0, so it is
                # excluded whenever year_start is set.
                if year_start or year_end:
                    try:
                        pub_year = int(metadata['year']) if metadata['year'] else 0
                        if year_start and pub_year < year_start:
                            continue
                        if year_end and pub_year > year_end:
                            continue
                    except ValueError:
                        pass

                results.append(metadata)

                # Rate limiting to avoid blocking: random 2-5 s pause
                # between result fetches.
                time.sleep(random.uniform(2, 5))

        except Exception as e:
            # Keep whatever was collected before the failure.
            print(f'Error during search: {e}', file=sys.stderr)

        # Sort if requested (most-cited first).
        if sort_by == 'citations' and results:
            results.sort(key=lambda x: x.get('citations', 0), reverse=True)

        return results

    def metadata_to_bibtex(self, metadata: Dict) -> str:
        """Convert metadata to BibTeX format.

        Builds the citation key as <lastname><year><first long title
        word>. Chooses @inproceedings vs @article by scanning the venue
        name for 'proceedings'/'conference'.
        """
        # Generate citation key from the first author's last token.
        # NOTE(review): assumes 'authors' is "First Last, First Last, ..."
        # as produced by search() above — a "Last, First" string would
        # yield the first name instead; confirm with the caller.
        if metadata.get('authors'):
            first_author = metadata['authors'].split(',')[0].strip()
            last_name = first_author.split()[-1] if first_author else 'Unknown'
        else:
            last_name = 'Unknown'

        year = metadata.get('year', 'XXXX')

        # Get keyword from title: first word of 4+ letters, else 'paper'.
        import re
        title = metadata.get('title', '')
        words = re.findall(r'\b[a-zA-Z]{4,}\b', title)
        keyword = words[0].lower() if words else 'paper'

        citation_key = f'{last_name}{year}{keyword}'

        # Determine entry type (guess based on venue)
        venue = metadata.get('venue', '').lower()
        if 'proceedings' in venue or 'conference' in venue:
            entry_type = 'inproceedings'
            venue_field = 'booktitle'
        else:
            entry_type = 'article'
            venue_field = 'journal'

        # Build BibTeX
        lines = [f'@{entry_type}{{{citation_key},']

        # Convert authors format: comma-separated list -> "and"-separated.
        if metadata.get('authors'):
            authors = metadata['authors'].replace(',', ' and')
            lines.append(f' author = {{{authors}}},')

        if metadata.get('title'):
            lines.append(f' title = {{{metadata["title"]}}},')

        if metadata.get('venue'):
            lines.append(f' {venue_field} = {{{metadata["venue"]}}},')

        if metadata.get('year'):
            lines.append(f' year = {{{metadata["year"]}}},')

        if metadata.get('url'):
            lines.append(f' url = {{{metadata["url"]}}},')

        # Citation count recorded as a note; 0 is falsy and so omitted.
        if metadata.get('citations'):
            lines.append(f' note = {{Cited by: {metadata["citations"]}}},')

        # Remove trailing comma
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]

        lines.append('}')

        return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line interface.

    Parses flags, searches Google Scholar via GoogleScholarSearcher,
    and emits results as JSON or BibTeX to a file or stdout. Exits with
    status 1 when scholarly is unavailable or no results come back.
    """
    parser = argparse.ArgumentParser(
        description='Search Google Scholar (requires scholarly library)',
        epilog='Example: python search_google_scholar.py "machine learning" --limit 50'
    )

    parser.add_argument(
        'query',
        help='Search query'
    )

    parser.add_argument(
        '--limit',
        type=int,
        default=50,
        help='Maximum number of results (default: 50)'
    )

    parser.add_argument(
        '--year-start',
        type=int,
        help='Start year for filtering'
    )

    parser.add_argument(
        '--year-end',
        type=int,
        help='End year for filtering'
    )

    parser.add_argument(
        '--sort-by',
        choices=['relevance', 'citations'],
        default='relevance',
        help='Sort order (default: relevance)'
    )

    parser.add_argument(
        '--use-proxy',
        action='store_true',
        help='Use free proxy to avoid rate limiting'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file (default: stdout)'
    )

    parser.add_argument(
        '--format',
        choices=['json', 'bibtex'],
        default='json',
        help='Output format (default: json)'
    )

    args = parser.parse_args()

    # Fail early with install guidance when scholarly is missing.
    if not SCHOLARLY_AVAILABLE:
        print('\nError: scholarly library not installed', file=sys.stderr)
        print('Install with: pip install scholarly', file=sys.stderr)
        print('\nAlternatively, use PubMed search for biomedical literature:', file=sys.stderr)
        print(' python search_pubmed.py "your query"', file=sys.stderr)
        sys.exit(1)

    # Search
    searcher = GoogleScholarSearcher(use_proxy=args.use_proxy)
    results = searcher.search(
        args.query,
        max_results=args.limit,
        year_start=args.year_start,
        year_end=args.year_end,
        sort_by=args.sort_by
    )

    if not results:
        print('No results found', file=sys.stderr)
        sys.exit(1)

    # Format output
    if args.format == 'json':
        output = json.dumps({
            'query': args.query,
            'count': len(results),
            'results': results
        }, indent=2)
    else:  # bibtex
        bibtex_entries = [searcher.metadata_to_bibtex(r) for r in results]
        output = '\n\n'.join(bibtex_entries) + '\n'

    # Write output: file when -o is given, otherwise stdout.
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'Wrote {len(results)} results to {args.output}', file=sys.stderr)
    else:
        print(output)

    print(f'\nRetrieved {len(results)} results', file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
|
|
||||||
398
scripts/search_pubmed.py
Executable file
398
scripts/search_pubmed.py
Executable file
@@ -0,0 +1,398 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
PubMed Search Tool
|
||||||
|
Search PubMed using E-utilities API and export results.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
class PubMedSearcher:
    """Search PubMed using NCBI E-utilities API.

    Two-step flow: search() runs esearch.fcgi to get PMIDs, then
    fetch_metadata() runs efetch.fcgi in batches and parses the
    returned XML. Requests are rate-limited per NCBI guidelines.
    """

    def __init__(self, api_key: Optional[str] = None, email: Optional[str] = None):
        """
        Initialize searcher.

        Args:
            api_key: NCBI API key (optional but recommended;
                falls back to the NCBI_API_KEY env var)
            email: Email for Entrez (optional but recommended;
                falls back to the NCBI_EMAIL env var)
        """
        self.api_key = api_key or os.getenv('NCBI_API_KEY', '')
        self.email = email or os.getenv('NCBI_EMAIL', '')
        self.base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
        self.session = requests.Session()

        # Rate limiting
        self.delay = 0.11 if self.api_key else 0.34  # 10/sec with key, 3/sec without

    def search(self, query: str, max_results: int = 100,
               date_start: Optional[str] = None, date_end: Optional[str] = None,
               publication_types: Optional[List[str]] = None) -> List[str]:
        """
        Search PubMed and return PMIDs.

        Date and publication-type filters are appended to the query
        string using PubMed field tags before the esearch call.
        Returns an empty list on any request/parse failure.

        Args:
            query: Search query
            max_results: Maximum number of results
            date_start: Start date (YYYY/MM/DD or YYYY)
            date_end: End date (YYYY/MM/DD or YYYY)
            publication_types: List of publication types to filter

        Returns:
            List of PMIDs
        """
        # Build query with filters
        full_query = query

        # Add date range; defaults cover 1900 through the current year.
        if date_start or date_end:
            start = date_start or '1900'
            end = date_end or datetime.now().strftime('%Y')
            full_query += f' AND {start}:{end}[Publication Date]'

        # Add publication types as an OR-group of [Publication Type] tags.
        if publication_types:
            pub_type_query = ' OR '.join([f'"{pt}"[Publication Type]' for pt in publication_types])
            full_query += f' AND ({pub_type_query})'

        print(f'Searching PubMed: {full_query}', file=sys.stderr)

        # ESearch to get PMIDs
        esearch_url = self.base_url + 'esearch.fcgi'
        params = {
            'db': 'pubmed',
            'term': full_query,
            'retmax': max_results,
            'retmode': 'json'
        }

        # Identify ourselves to NCBI when credentials are configured.
        if self.email:
            params['email'] = self.email
        if self.api_key:
            params['api_key'] = self.api_key

        try:
            response = self.session.get(esearch_url, params=params, timeout=30)
            response.raise_for_status()

            data = response.json()
            pmids = data['esearchresult']['idlist']
            count = int(data['esearchresult']['count'])

            # count is the total number of hits; idlist is capped at retmax.
            print(f'Found {count} results, retrieving {len(pmids)}', file=sys.stderr)

            return pmids

        except Exception as e:
            print(f'Error searching PubMed: {e}', file=sys.stderr)
            return []

    def fetch_metadata(self, pmids: List[str]) -> List[Dict]:
        """
        Fetch metadata for PMIDs.

        Fetches in batches of 200 via efetch.fcgi; a failed batch is
        logged and skipped, so partial results are still returned.

        Args:
            pmids: List of PubMed IDs

        Returns:
            List of metadata dictionaries
        """
        if not pmids:
            return []

        metadata_list = []

        # Fetch in batches of 200
        batch_size = 200
        for i in range(0, len(pmids), batch_size):
            batch = pmids[i:i+batch_size]
            print(f'Fetching metadata for PMIDs {i+1}-{min(i+batch_size, len(pmids))}...', file=sys.stderr)

            efetch_url = self.base_url + 'efetch.fcgi'
            params = {
                'db': 'pubmed',
                'id': ','.join(batch),
                'retmode': 'xml',
                'rettype': 'abstract'
            }

            if self.email:
                params['email'] = self.email
            if self.api_key:
                params['api_key'] = self.api_key

            try:
                response = self.session.get(efetch_url, params=params, timeout=60)
                response.raise_for_status()

                # Parse XML
                root = ET.fromstring(response.content)
                articles = root.findall('.//PubmedArticle')

                for article in articles:
                    metadata = self._extract_metadata_from_xml(article)
                    if metadata:
                        metadata_list.append(metadata)

                # Rate limiting between batches (see __init__ for rates).
                time.sleep(self.delay)

            except Exception as e:
                print(f'Error fetching metadata for batch: {e}', file=sys.stderr)
                continue

        return metadata_list

    def _extract_metadata_from_xml(self, article: ET.Element) -> Optional[Dict]:
        """Extract metadata from PubmedArticle XML element.

        Returns None (after logging) when the element does not have the
        expected MedlineCitation/Article structure.
        """
        try:
            medline_citation = article.find('.//MedlineCitation')
            article_elem = medline_citation.find('.//Article')
            journal = article_elem.find('.//Journal')

            # Get PMID
            pmid = medline_citation.findtext('.//PMID', '')

            # Get DOI from the ArticleId list (IdType="doi").
            doi = None
            article_ids = article.findall('.//ArticleId')
            for article_id in article_ids:
                if article_id.get('IdType') == 'doi':
                    doi = article_id.text
                    break

            # Get authors as "Last, First" strings; authors lacking a
            # LastName element are skipped.
            authors = []
            author_list = article_elem.find('.//AuthorList')
            if author_list is not None:
                for author in author_list.findall('.//Author'):
                    last_name = author.findtext('.//LastName', '')
                    fore_name = author.findtext('.//ForeName', '')
                    if last_name:
                        if fore_name:
                            authors.append(f'{last_name}, {fore_name}')
                        else:
                            authors.append(last_name)

            # Get year; fall back to the first 4-digit run in MedlineDate
            # (used for issues without a structured Year element).
            year = article_elem.findtext('.//Journal/JournalIssue/PubDate/Year', '')
            if not year:
                medline_date = article_elem.findtext('.//Journal/JournalIssue/PubDate/MedlineDate', '')
                if medline_date:
                    import re
                    year_match = re.search(r'\d{4}', medline_date)
                    if year_match:
                        year = year_match.group()

            # NOTE(review): findtext returns only the text before the
            # first child element, so titles/abstracts containing inline
            # markup may be truncated — confirm against real records.
            metadata = {
                'pmid': pmid,
                'doi': doi,
                'title': article_elem.findtext('.//ArticleTitle', ''),
                'authors': ' and '.join(authors),
                'journal': journal.findtext('.//Title', ''),
                'year': year,
                'volume': journal.findtext('.//JournalIssue/Volume', ''),
                'issue': journal.findtext('.//JournalIssue/Issue', ''),
                'pages': article_elem.findtext('.//Pagination/MedlinePgn', ''),
                'abstract': article_elem.findtext('.//Abstract/AbstractText', '')
            }

            return metadata

        except Exception as e:
            print(f'Error extracting metadata: {e}', file=sys.stderr)
            return None

    def metadata_to_bibtex(self, metadata: Dict) -> str:
        """Convert metadata to BibTeX format.

        Citation key is <lastname><year>pmid<pmid>; the PMID suffix
        keeps keys unique across entries. Always emits an @article.
        """
        # Generate citation key from the first author's last name;
        # handles both "Last, First" and "First Last" author strings.
        if metadata.get('authors'):
            first_author = metadata['authors'].split(' and ')[0]
            if ',' in first_author:
                last_name = first_author.split(',')[0].strip()
            else:
                last_name = first_author.split()[0]
        else:
            last_name = 'Unknown'

        year = metadata.get('year', 'XXXX')
        citation_key = f'{last_name}{year}pmid{metadata.get("pmid", "")}'

        # Build BibTeX entry
        lines = [f'@article{{{citation_key},']

        if metadata.get('authors'):
            lines.append(f' author = {{{metadata["authors"]}}},')

        if metadata.get('title'):
            lines.append(f' title = {{{metadata["title"]}}},')

        if metadata.get('journal'):
            lines.append(f' journal = {{{metadata["journal"]}}},')

        if metadata.get('year'):
            lines.append(f' year = {{{metadata["year"]}}},')

        if metadata.get('volume'):
            lines.append(f' volume = {{{metadata["volume"]}}},')

        if metadata.get('issue'):
            lines.append(f' number = {{{metadata["issue"]}}},')

        # Normalize the MedlinePgn range to BibTeX's double hyphen.
        if metadata.get('pages'):
            pages = metadata['pages'].replace('-', '--')
            lines.append(f' pages = {{{pages}}},')

        if metadata.get('doi'):
            lines.append(f' doi = {{{metadata["doi"]}}},')

        if metadata.get('pmid'):
            lines.append(f' note = {{PMID: {metadata["pmid"]}}},')

        # Remove trailing comma
        if lines[-1].endswith(','):
            lines[-1] = lines[-1][:-1]

        lines.append('}')

        return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line interface.

    The query may come from the positional argument, --query, or
    --query-file (which, when given, overrides the other two). Searches
    PubMed, fetches metadata, and emits JSON or BibTeX to a file or
    stdout. Exits with status 1 on missing query, unreadable query
    file, or zero results.
    """
    parser = argparse.ArgumentParser(
        description='Search PubMed using E-utilities API',
        epilog='Example: python search_pubmed.py "CRISPR gene editing" --limit 100'
    )

    parser.add_argument(
        'query',
        nargs='?',
        help='Search query (PubMed syntax)'
    )

    # dest avoids clashing with the positional 'query' argument.
    parser.add_argument(
        '--query',
        dest='query_arg',
        help='Search query (alternative to positional argument)'
    )

    parser.add_argument(
        '--query-file',
        help='File containing search query'
    )

    parser.add_argument(
        '--limit',
        type=int,
        default=100,
        help='Maximum number of results (default: 100)'
    )

    parser.add_argument(
        '--date-start',
        help='Start date (YYYY/MM/DD or YYYY)'
    )

    parser.add_argument(
        '--date-end',
        help='End date (YYYY/MM/DD or YYYY)'
    )

    parser.add_argument(
        '--publication-types',
        help='Comma-separated publication types (e.g., "Review,Clinical Trial")'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file (default: stdout)'
    )

    parser.add_argument(
        '--format',
        choices=['json', 'bibtex'],
        default='json',
        help='Output format (default: json)'
    )

    parser.add_argument(
        '--api-key',
        help='NCBI API key (or set NCBI_API_KEY env var)'
    )

    parser.add_argument(
        '--email',
        help='Email for Entrez (or set NCBI_EMAIL env var)'
    )

    args = parser.parse_args()

    # Get query: positional wins over --query; --query-file overrides both.
    query = args.query or args.query_arg

    if args.query_file:
        try:
            with open(args.query_file, 'r', encoding='utf-8') as f:
                query = f.read().strip()
        except Exception as e:
            print(f'Error reading query file: {e}', file=sys.stderr)
            sys.exit(1)

    if not query:
        parser.print_help()
        sys.exit(1)

    # Parse publication types from the comma-separated flag value.
    pub_types = None
    if args.publication_types:
        pub_types = [pt.strip() for pt in args.publication_types.split(',')]

    # Search PubMed
    searcher = PubMedSearcher(api_key=args.api_key, email=args.email)
    pmids = searcher.search(
        query,
        max_results=args.limit,
        date_start=args.date_start,
        date_end=args.date_end,
        publication_types=pub_types
    )

    if not pmids:
        print('No results found', file=sys.stderr)
        sys.exit(1)

    # Fetch metadata
    metadata_list = searcher.fetch_metadata(pmids)

    # Format output
    if args.format == 'json':
        output = json.dumps({
            'query': query,
            'count': len(metadata_list),
            'results': metadata_list
        }, indent=2)
    else:  # bibtex
        bibtex_entries = [searcher.metadata_to_bibtex(m) for m in metadata_list]
        output = '\n\n'.join(bibtex_entries) + '\n'

    # Write output: file when -o is given, otherwise stdout.
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output)
        print(f'Wrote {len(metadata_list)} results to {args.output}', file=sys.stderr)
    else:
        print(output)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
|
|
||||||
497
scripts/validate_citations.py
Executable file
497
scripts/validate_citations.py
Executable file
@@ -0,0 +1,497 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Citation Validation Tool
|
||||||
|
Validate BibTeX files for accuracy, completeness, and format compliance.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from typing import Dict, List, Tuple, Optional
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
class CitationValidator:
|
||||||
|
"""Validate BibTeX entries for errors and inconsistencies."""
|
||||||
|
|
||||||
|
def __init__(self):
    """Initialize the validator's HTTP session and field requirement tables."""
    # Shared session for any online lookups; a descriptive User-Agent
    # identifies this tool to remote services.
    self.session = requests.Session()
    self.session.headers.update({
        'User-Agent': 'CitationValidator/1.0 (Citation Management Tool)'
    })

    # Required fields by entry type: an entry of a given type missing
    # any of these should be reported as an error.
    self.required_fields = {
        'article': ['author', 'title', 'journal', 'year'],
        'book': ['title', 'publisher', 'year'],  # author OR editor
        'inproceedings': ['author', 'title', 'booktitle', 'year'],
        'incollection': ['author', 'title', 'booktitle', 'publisher', 'year'],
        'phdthesis': ['author', 'title', 'school', 'year'],
        'mastersthesis': ['author', 'title', 'school', 'year'],
        'techreport': ['author', 'title', 'institution', 'year'],
        'misc': ['title', 'year']
    }

    # Recommended fields: missing ones warrant a warning, not an error.
    self.recommended_fields = {
        'article': ['volume', 'pages', 'doi'],
        'book': ['isbn'],
        'inproceedings': ['pages'],
    }
|
||||||
|
|
||||||
|
def parse_bibtex_file(self, filepath: str) -> List[Dict]:
|
||||||
|
"""
|
||||||
|
Parse BibTeX file and extract entries.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filepath: Path to BibTeX file
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of entry dictionaries
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
with open(filepath, 'r', encoding='utf-8') as f:
|
||||||
|
content = f.read()
|
||||||
|
except Exception as e:
|
||||||
|
print(f'Error reading file: {e}', file=sys.stderr)
|
||||||
|
return []
|
||||||
|
|
||||||
|
entries = []
|
||||||
|
|
||||||
|
# Match BibTeX entries
|
||||||
|
pattern = r'@(\w+)\s*\{\s*([^,\s]+)\s*,(.*?)\n\}'
|
||||||
|
matches = re.finditer(pattern, content, re.DOTALL | re.IGNORECASE)
|
||||||
|
|
||||||
|
for match in matches:
|
||||||
|
entry_type = match.group(1).lower()
|
||||||
|
citation_key = match.group(2).strip()
|
||||||
|
fields_text = match.group(3)
|
||||||
|
|
||||||
|
# Parse fields
|
||||||
|
fields = {}
|
||||||
|
field_pattern = r'(\w+)\s*=\s*\{([^}]*)\}|(\w+)\s*=\s*"([^"]*)"'
|
||||||
|
field_matches = re.finditer(field_pattern, fields_text)
|
||||||
|
|
||||||
|
for field_match in field_matches:
|
||||||
|
if field_match.group(1):
|
||||||
|
field_name = field_match.group(1).lower()
|
||||||
|
field_value = field_match.group(2)
|
||||||
|
else:
|
||||||
|
field_name = field_match.group(3).lower()
|
||||||
|
field_value = field_match.group(4)
|
||||||
|
|
||||||
|
fields[field_name] = field_value.strip()
|
||||||
|
|
||||||
|
entries.append({
|
||||||
|
'type': entry_type,
|
||||||
|
'key': citation_key,
|
||||||
|
'fields': fields,
|
||||||
|
'raw': match.group(0)
|
||||||
|
})
|
||||||
|
|
||||||
|
return entries
|
||||||
|
|
||||||
|
def validate_entry(self, entry: Dict) -> Tuple[List[Dict], List[Dict]]:
|
||||||
|
"""
|
||||||
|
Validate a single BibTeX entry.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
entry: Entry dictionary
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (errors, warnings)
|
||||||
|
"""
|
||||||
|
errors = []
|
||||||
|
warnings = []
|
||||||
|
|
||||||
|
entry_type = entry['type']
|
||||||
|
key = entry['key']
|
||||||
|
fields = entry['fields']
|
||||||
|
|
||||||
|
# Check required fields
|
||||||
|
if entry_type in self.required_fields:
|
||||||
|
for req_field in self.required_fields[entry_type]:
|
||||||
|
if req_field not in fields or not fields[req_field]:
|
||||||
|
# Special case: book can have author OR editor
|
||||||
|
if entry_type == 'book' and req_field == 'author':
|
||||||
|
if 'editor' not in fields or not fields['editor']:
|
||||||
|
errors.append({
|
||||||
|
'type': 'missing_required_field',
|
||||||
|
'field': 'author or editor',
|
||||||
|
'severity': 'high',
|
||||||
|
'message': f'Entry {key}: Missing required field "author" or "editor"'
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
errors.append({
|
||||||
|
'type': 'missing_required_field',
|
||||||
|
'field': req_field,
|
||||||
|
'severity': 'high',
|
||||||
|
'message': f'Entry {key}: Missing required field "{req_field}"'
|
||||||
|
})
|
||||||
|
|
||||||
|
# Check recommended fields
|
||||||
|
if entry_type in self.recommended_fields:
|
||||||
|
for rec_field in self.recommended_fields[entry_type]:
|
||||||
|
if rec_field not in fields or not fields[rec_field]:
|
||||||
|
warnings.append({
|
||||||
|
'type': 'missing_recommended_field',
|
||||||
|
'field': rec_field,
|
||||||
|
'severity': 'medium',
|
||||||
|
'message': f'Entry {key}: Missing recommended field "{rec_field}"'
|
||||||
|
})
|
||||||
|
|
||||||
|
# Validate year
|
||||||
|
if 'year' in fields:
|
||||||
|
year = fields['year']
|
||||||
|
if not re.match(r'^\d{4}$', year):
|
||||||
|
errors.append({
|
||||||
|
'type': 'invalid_year',
|
||||||
|
'field': 'year',
|
||||||
|
'value': year,
|
||||||
|
'severity': 'high',
|
||||||
|
'message': f'Entry {key}: Invalid year format "{year}" (should be 4 digits)'
|
||||||
|
})
|
||||||
|
elif int(year) < 1600 or int(year) > 2030:
|
||||||
|
warnings.append({
|
||||||
|
'type': 'suspicious_year',
|
||||||
|
'field': 'year',
|
||||||
|
'value': year,
|
||||||
|
'severity': 'medium',
|
||||||
|
'message': f'Entry {key}: Suspicious year "{year}" (outside reasonable range)'
|
||||||
|
})
|
||||||
|
|
||||||
|
# Validate DOI format
|
||||||
|
if 'doi' in fields:
|
||||||
|
doi = fields['doi']
|
||||||
|
if not re.match(r'^10\.\d{4,}/[^\s]+$', doi):
|
||||||
|
warnings.append({
|
||||||
|
'type': 'invalid_doi_format',
|
||||||
|
'field': 'doi',
|
||||||
|
'value': doi,
|
||||||
|
'severity': 'medium',
|
||||||
|
'message': f'Entry {key}: Invalid DOI format "{doi}"'
|
||||||
|
})
|
||||||
|
|
||||||
|
# Check for single hyphen in pages (should be --)
|
||||||
|
if 'pages' in fields:
|
||||||
|
pages = fields['pages']
|
||||||
|
if re.search(r'\d-\d', pages) and '--' not in pages:
|
||||||
|
warnings.append({
|
||||||
|
'type': 'page_range_format',
|
||||||
|
'field': 'pages',
|
||||||
|
'value': pages,
|
||||||
|
'severity': 'low',
|
||||||
|
'message': f'Entry {key}: Page range uses single hyphen, should use -- (en-dash)'
|
||||||
|
})
|
||||||
|
|
||||||
|
# Check author format
|
||||||
|
if 'author' in fields:
|
||||||
|
author = fields['author']
|
||||||
|
if ';' in author or '&' in author:
|
||||||
|
errors.append({
|
||||||
|
'type': 'invalid_author_format',
|
||||||
|
'field': 'author',
|
||||||
|
'severity': 'high',
|
||||||
|
'message': f'Entry {key}: Authors should be separated by " and ", not ";" or "&"'
|
||||||
|
})
|
||||||
|
|
||||||
|
return errors, warnings
|
||||||
|
|
||||||
|
def verify_doi(self, doi: str) -> Tuple[bool, Optional[Dict]]:
|
||||||
|
"""
|
||||||
|
Verify DOI resolves correctly and get metadata.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
doi: Digital Object Identifier
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (is_valid, metadata)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
url = f'https://doi.org/{doi}'
|
||||||
|
response = self.session.head(url, timeout=10, allow_redirects=True)
|
||||||
|
|
||||||
|
if response.status_code < 400:
|
||||||
|
# DOI resolves, now get metadata from CrossRef
|
||||||
|
crossref_url = f'https://api.crossref.org/works/{doi}'
|
||||||
|
metadata_response = self.session.get(crossref_url, timeout=10)
|
||||||
|
|
||||||
|
if metadata_response.status_code == 200:
|
||||||
|
data = metadata_response.json()
|
||||||
|
message = data.get('message', {})
|
||||||
|
|
||||||
|
# Extract key metadata
|
||||||
|
metadata = {
|
||||||
|
'title': message.get('title', [''])[0],
|
||||||
|
'year': self._extract_year_crossref(message),
|
||||||
|
'authors': self._format_authors_crossref(message.get('author', [])),
|
||||||
|
}
|
||||||
|
return True, metadata
|
||||||
|
else:
|
||||||
|
return True, None # DOI resolves but no CrossRef metadata
|
||||||
|
else:
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
except Exception:
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
def detect_duplicates(self, entries: List[Dict]) -> List[Dict]:
|
||||||
|
"""
|
||||||
|
Detect duplicate entries.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
entries: List of entry dictionaries
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of duplicate groups
|
||||||
|
"""
|
||||||
|
duplicates = []
|
||||||
|
|
||||||
|
# Check for duplicate DOIs
|
||||||
|
doi_map = defaultdict(list)
|
||||||
|
for entry in entries:
|
||||||
|
doi = entry['fields'].get('doi', '').strip()
|
||||||
|
if doi:
|
||||||
|
doi_map[doi].append(entry['key'])
|
||||||
|
|
||||||
|
for doi, keys in doi_map.items():
|
||||||
|
if len(keys) > 1:
|
||||||
|
duplicates.append({
|
||||||
|
'type': 'duplicate_doi',
|
||||||
|
'doi': doi,
|
||||||
|
'entries': keys,
|
||||||
|
'severity': 'high',
|
||||||
|
'message': f'Duplicate DOI {doi} found in entries: {", ".join(keys)}'
|
||||||
|
})
|
||||||
|
|
||||||
|
# Check for duplicate citation keys
|
||||||
|
key_counts = defaultdict(int)
|
||||||
|
for entry in entries:
|
||||||
|
key_counts[entry['key']] += 1
|
||||||
|
|
||||||
|
for key, count in key_counts.items():
|
||||||
|
if count > 1:
|
||||||
|
duplicates.append({
|
||||||
|
'type': 'duplicate_key',
|
||||||
|
'key': key,
|
||||||
|
'count': count,
|
||||||
|
'severity': 'high',
|
||||||
|
'message': f'Citation key "{key}" appears {count} times'
|
||||||
|
})
|
||||||
|
|
||||||
|
# Check for similar titles (possible duplicates)
|
||||||
|
titles = {}
|
||||||
|
for entry in entries:
|
||||||
|
title = entry['fields'].get('title', '').lower()
|
||||||
|
title = re.sub(r'[^\w\s]', '', title) # Remove punctuation
|
||||||
|
title = ' '.join(title.split()) # Normalize whitespace
|
||||||
|
|
||||||
|
if title:
|
||||||
|
if title in titles:
|
||||||
|
duplicates.append({
|
||||||
|
'type': 'similar_title',
|
||||||
|
'entries': [titles[title], entry['key']],
|
||||||
|
'severity': 'medium',
|
||||||
|
'message': f'Possible duplicate: "{titles[title]}" and "{entry["key"]}" have identical titles'
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
titles[title] = entry['key']
|
||||||
|
|
||||||
|
return duplicates
|
||||||
|
|
||||||
|
def validate_file(self, filepath: str, check_dois: bool = False) -> Dict:
|
||||||
|
"""
|
||||||
|
Validate entire BibTeX file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filepath: Path to BibTeX file
|
||||||
|
check_dois: Whether to verify DOIs (slow)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Validation report dictionary
|
||||||
|
"""
|
||||||
|
print(f'Parsing {filepath}...', file=sys.stderr)
|
||||||
|
entries = self.parse_bibtex_file(filepath)
|
||||||
|
|
||||||
|
if not entries:
|
||||||
|
return {
|
||||||
|
'total_entries': 0,
|
||||||
|
'errors': [],
|
||||||
|
'warnings': [],
|
||||||
|
'duplicates': []
|
||||||
|
}
|
||||||
|
|
||||||
|
print(f'Found {len(entries)} entries', file=sys.stderr)
|
||||||
|
|
||||||
|
all_errors = []
|
||||||
|
all_warnings = []
|
||||||
|
|
||||||
|
# Validate each entry
|
||||||
|
for i, entry in enumerate(entries):
|
||||||
|
print(f'Validating entry {i+1}/{len(entries)}: {entry["key"]}', file=sys.stderr)
|
||||||
|
errors, warnings = self.validate_entry(entry)
|
||||||
|
|
||||||
|
for error in errors:
|
||||||
|
error['entry'] = entry['key']
|
||||||
|
all_errors.append(error)
|
||||||
|
|
||||||
|
for warning in warnings:
|
||||||
|
warning['entry'] = entry['key']
|
||||||
|
all_warnings.append(warning)
|
||||||
|
|
||||||
|
# Check for duplicates
|
||||||
|
print('Checking for duplicates...', file=sys.stderr)
|
||||||
|
duplicates = self.detect_duplicates(entries)
|
||||||
|
|
||||||
|
# Verify DOIs if requested
|
||||||
|
doi_errors = []
|
||||||
|
if check_dois:
|
||||||
|
print('Verifying DOIs...', file=sys.stderr)
|
||||||
|
for i, entry in enumerate(entries):
|
||||||
|
doi = entry['fields'].get('doi', '')
|
||||||
|
if doi:
|
||||||
|
print(f'Verifying DOI {i+1}: {doi}', file=sys.stderr)
|
||||||
|
is_valid, metadata = self.verify_doi(doi)
|
||||||
|
|
||||||
|
if not is_valid:
|
||||||
|
doi_errors.append({
|
||||||
|
'type': 'invalid_doi',
|
||||||
|
'entry': entry['key'],
|
||||||
|
'doi': doi,
|
||||||
|
'severity': 'high',
|
||||||
|
'message': f'Entry {entry["key"]}: DOI does not resolve: {doi}'
|
||||||
|
})
|
||||||
|
|
||||||
|
all_errors.extend(doi_errors)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'filepath': filepath,
|
||||||
|
'total_entries': len(entries),
|
||||||
|
'valid_entries': len(entries) - len([e for e in all_errors if e['severity'] == 'high']),
|
||||||
|
'errors': all_errors,
|
||||||
|
'warnings': all_warnings,
|
||||||
|
'duplicates': duplicates
|
||||||
|
}
|
||||||
|
|
||||||
|
def _extract_year_crossref(self, message: Dict) -> str:
|
||||||
|
"""Extract year from CrossRef message."""
|
||||||
|
date_parts = message.get('published-print', {}).get('date-parts', [[]])
|
||||||
|
if not date_parts or not date_parts[0]:
|
||||||
|
date_parts = message.get('published-online', {}).get('date-parts', [[]])
|
||||||
|
|
||||||
|
if date_parts and date_parts[0]:
|
||||||
|
return str(date_parts[0][0])
|
||||||
|
return ''
|
||||||
|
|
||||||
|
def _format_authors_crossref(self, authors: List[Dict]) -> str:
|
||||||
|
"""Format author list from CrossRef."""
|
||||||
|
if not authors:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
formatted = []
|
||||||
|
for author in authors[:3]: # First 3 authors
|
||||||
|
given = author.get('given', '')
|
||||||
|
family = author.get('family', '')
|
||||||
|
if family:
|
||||||
|
formatted.append(f'{family}, {given}' if given else family)
|
||||||
|
|
||||||
|
if len(authors) > 3:
|
||||||
|
formatted.append('et al.')
|
||||||
|
|
||||||
|
return ', '.join(formatted)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line interface.

    Parses arguments, validates the given BibTeX file, prints a summary
    (plus error/warning/duplicate sections), optionally writes a JSON
    report, and exits with status 1 when any errors were found.
    """
    arg_parser = argparse.ArgumentParser(
        description='Validate BibTeX files for errors and inconsistencies',
        epilog='Example: python validate_citations.py references.bib'
    )
    arg_parser.add_argument(
        'file',
        help='BibTeX file to validate'
    )
    arg_parser.add_argument(
        '--check-dois',
        action='store_true',
        help='Verify DOIs resolve correctly (slow)'
    )
    arg_parser.add_argument(
        '--auto-fix',
        action='store_true',
        help='Attempt to auto-fix common issues (not implemented yet)'
    )
    arg_parser.add_argument(
        '--report',
        help='Output file for JSON validation report'
    )
    arg_parser.add_argument(
        '--verbose',
        action='store_true',
        help='Show detailed output'
    )
    opts = arg_parser.parse_args()

    # Run validation.
    report = CitationValidator().validate_file(opts.file, check_dois=opts.check_dois)

    banner = '=' * 60
    rule = '-' * 60

    # Summary section.
    print('\n' + banner)
    print('CITATION VALIDATION REPORT')
    print(banner)
    print(f'\nFile: {opts.file}')
    print(f'Total entries: {report["total_entries"]}')
    print(f'Valid entries: {report["valid_entries"]}')
    print(f'Errors: {len(report["errors"])}')
    print(f'Warnings: {len(report["warnings"])}')
    print(f'Duplicates: {len(report["duplicates"])}')

    # Errors: always shown; per-issue detail only with --verbose.
    if report['errors']:
        print('\n' + rule)
        print('ERRORS (must fix):')
        print(rule)
        for issue in report['errors']:
            print(f'\n{issue["message"]}')
            if opts.verbose:
                print(f'  Type: {issue["type"]}')
                print(f'  Severity: {issue["severity"]}')

    # Warnings: shown only with --verbose.
    if report['warnings'] and opts.verbose:
        print('\n' + rule)
        print('WARNINGS (should fix):')
        print(rule)
        for issue in report['warnings']:
            print(f'\n{issue["message"]}')

    # Duplicates: always shown.
    if report['duplicates']:
        print('\n' + rule)
        print('DUPLICATES:')
        print(rule)
        for issue in report['duplicates']:
            print(f'\n{issue["message"]}')

    # Optional machine-readable report.
    if opts.report:
        with open(opts.report, 'w', encoding='utf-8') as out:
            json.dump(report, out, indent=2)
        print(f'\nDetailed report saved to: {opts.report}')

    # Non-zero exit status signals validation failure to calling scripts.
    if report['errors']:
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()
|
||||||
|
|
||||||
Reference in New Issue
Block a user