Files
chinese-novelist-skill/scripts/check_chapter_wordcount.py
2026-01-28 23:58:37 +08:00

179 lines
5.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
章节字数检查脚本
检查指定章节文件的字数低于3000字时提示需要扩充
"""
import os
import re
import sys
from pathlib import Path
# Work around Windows console encoding problems: force UTF-8 on stdout and
# stderr so the Chinese report text and status symbols are written instead of
# raising UnicodeEncodeError under a legacy code page.
if sys.platform == 'win32':
    import io

    for _stream_name in ('stdout', 'stderr'):
        _stream = getattr(sys, _stream_name)
        if _stream is None:
            # The interpreter may start without console streams (pythonw).
            continue
        if hasattr(_stream, 'reconfigure'):
            # Python 3.7+: change the encoding in place so the stream keeps
            # its existing buffering behaviour.
            _stream.reconfigure(encoding='utf-8', errors='replace')
        elif hasattr(_stream, 'buffer'):
            # Older interpreters: re-wrap the underlying binary buffer.
            setattr(sys, _stream_name, io.TextIOWrapper(
                _stream.buffer, encoding='utf-8', errors='replace'))
        # Streams with neither attribute (custom replacements) are left alone.
    del _stream_name, _stream
# Markdown constructs that are unwrapped before counting, applied in order.
# Bold (**) has to be handled before italic (*) so the double markers are not
# consumed as two empty italic spans.
_MARKDOWN_SUBSTITUTIONS = (
    (re.compile(r'#{1,6}\s*'), ''),               # heading markers
    (re.compile(r'\*\*(.*?)\*\*'), r'\1'),        # bold
    (re.compile(r'\*(.*?)\*'), r'\1'),            # italic
    (re.compile(r'~~(.*?)~~'), r'\1'),            # strikethrough
    (re.compile(r'`(.*?)`'), r'\1'),              # inline code
    (re.compile(r'\[(.*?)\]\(.*?\)'), r'\1'),     # links: keep text, drop URL
)

# Han ideographs.  Besides the basic block this also covers the ideographic
# zero used in dates (e.g. the second character of "2026" written in Han
# numerals), rare characters from Extension A, the compatibility block and
# the supplementary planes, so that unusual names and archaic characters are
# not silently left out of the count.
_HAN_CHAR_RE = re.compile(
    '['
    '\u3007'                  # ideographic number zero
    '\u3400-\u4dbf'           # CJK Unified Ideographs Extension A
    '\u4e00-\u9fff'           # CJK Unified Ideographs (basic block)
    '\uf900-\ufaff'           # CJK Compatibility Ideographs
    '\U00020000-\U000323af'   # supplementary-plane ideographs (Ext. B onward)
    ']'
)


def count_chinese_words(text: str) -> int:
    """Count the Han characters in *text*.

    Markdown markup is unwrapped first; the part that matters for the result
    is that link targets are removed, so Han characters inside a URL are not
    counted.  Punctuation, Latin letters, digits and whitespace never count.

    Args:
        text: Chapter text, possibly containing Markdown.

    Returns:
        The number of Han ideographs found.
    """
    for pattern, replacement in _MARKDOWN_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return sum(1 for _ in _HAN_CHAR_RE.finditer(text))
def extract_content_from_chapter(file_path: Path) -> str:
    """Return the body of a chapter file, without its leading heading.

    The file is read as UTF-8; a leading byte-order mark, if present, is
    discarded.  Everything up to and including the first line that starts
    with ``#`` is dropped.  If no line starts with ``#`` the whole file is
    returned.

    Args:
        file_path: Location of the chapter file.

    Returns:
        The remaining lines joined with ``\\n``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' strips a BOM (commonly written by Windows editors); with
    # plain 'utf-8' the BOM would sit in front of the first '#', the heading
    # would go undetected and its characters would be counted as body text.
    text = Path(file_path).read_text(encoding='utf-8-sig')
    lines = text.split('\n')

    # Any line beginning with '#' counts as the heading; no further check on
    # the title text is applied.
    heading_index = next(
        (index for index, line in enumerate(lines) if line.startswith('#')),
        None,
    )
    body_start = 0 if heading_index is None else heading_index + 1
    return '\n'.join(lines[body_start:])
def check_chapter(file_path: str, min_words: int = 3000) -> dict:
    """Check one chapter file against the minimum character count.

    Args:
        file_path: Path of the chapter file.
        min_words: Minimum number of Chinese characters required to pass.

    Returns:
        A dict with the keys ``file`` (path as a string), ``exists`` (bool),
        ``word_count`` (int), ``status`` (``'pass'``, ``'fail'`` or
        ``'error'``) and ``message`` (human-readable summary).  When the path
        is not a regular file the result has ``exists`` False, a count of 0
        and status ``'error'``.

    Raises:
        OSError, UnicodeDecodeError: Propagated from reading the file.
    """
    path = Path(file_path)
    # is_file() rather than exists(): a directory "exists" but cannot be read
    # as a chapter, and would otherwise fail when it is opened.
    if not path.is_file():
        return {
            'file': str(path),
            'exists': False,
            'word_count': 0,
            'status': 'error',
            'message': f'文件不存在: {file_path}',
        }

    word_count = count_chinese_words(extract_content_from_chapter(path))
    passed = word_count >= min_words
    verdict = ' (✓ 达标)' if passed else f' (✗ 不足,需要至少 {min_words} 字)'
    return {
        'file': str(path),
        'exists': True,
        'word_count': word_count,
        'status': 'pass' if passed else 'fail',
        'message': f'字数: {word_count}' + verdict,
    }
def check_all_chapters(directory: str, pattern: str = '第*.md', min_words: int = 3000) -> list:
    """Check every chapter file in *directory* that matches *pattern*.

    Args:
        directory: Directory that holds the chapter files.
        pattern: Glob pattern, relative to *directory*, selecting the files.
        min_words: Minimum number of Chinese characters for a chapter to pass.

    Returns:
        One ``check_chapter`` result dict per matching file, in chapter
        order.  An empty list when *directory* is not a directory (an error
        line is printed in that case) or when nothing matches.
    """
    dir_path = Path(directory)
    # is_dir() also rejects a regular file passed by mistake, which would
    # otherwise be reported as a directory with no chapters in it.
    if not dir_path.is_dir():
        print(f'错误: 目录不存在 - {directory}')
        return []

    def natural_key(path):
        # Split into alternating text / digit runs and compare the digit runs
        # as numbers, so chapter 2 sorts before chapter 10 even when the
        # numbers are not zero-padded.  The raw path is the tie-breaker so the
        # order stays deterministic for names such as "01" versus "1".
        text = path.as_posix()
        pieces = re.split(r'([0-9]+)', text)
        ordered = [int(piece) if index % 2 else piece
                   for index, piece in enumerate(pieces)]
        return ordered, text

    # Skip directories whose names happen to match the pattern.
    chapter_files = sorted(
        (candidate for candidate in dir_path.glob(pattern) if candidate.is_file()),
        key=natural_key,
    )
    return [check_chapter(str(chapter_file), min_words)
            for chapter_file in chapter_files]
def print_results(results: list, min_words: int = 3000):
    """Print a report for a list of chapter check results.

    Args:
        results: Result dicts carrying the keys ``file``, ``exists``,
            ``word_count``, ``status`` and ``message``.
        min_words: Threshold quoted in the advice printed when at least one
            chapter failed; it is not used to re-evaluate the results.

    Entries whose ``exists`` flag is false are listed with their message but
    are counted as neither passed nor failed.  Nothing is returned.
    """
    if not results:
        print('没有找到章节文件')
        return

    heavy_rule = '=' * 60
    light_rule = '-' * 60
    tally = {'pass': 0, 'fail': 0}
    char_total = 0

    print('\n' + heavy_rule)
    print('章节字数检查报告')
    print(heavy_rule)

    for entry in results:
        if entry['exists']:
            char_total += entry['word_count']
            ok = entry['status'] == 'pass'
            tally['pass' if ok else 'fail'] += 1
            marker = '' if ok else '⚠️ '
            heading = f'{marker} {Path(entry["file"]).name}'
        else:
            # Missing files are shown with their full path and no marker.
            heading = entry['file']
        print(f'\n{heading}')
        print(f' {entry["message"]}')

    passed_count = tally['pass']
    failed_count = tally['fail']
    print('\n' + light_rule)
    print(f'总计: {len(results)} 章 | {passed_count} 章达标 | {failed_count} 章不足 | 总字数: {char_total:,}')
    print(light_rule)

    if failed_count <= 0:
        return

    print(f'\n⚠️ 有 {failed_count} 章内容不足 {min_words} 字,建议使用扩充技巧:')
    for tip in (
        ' - 添加细节描写(环境、心理、动作)',
        ' - 增加对话场景',
        ' - 扩展人物内心活动',
        ' - 补充背景故事',
    ):
        print(tip)
    print('\n 参考: references/content-expansion.md')
def _parse_min_words(argv, index, default=3000):
    """Return the integer at ``argv[index]``, *default* if absent, None if invalid."""
    if len(argv) <= index:
        return default
    raw = argv[index]
    try:
        return int(raw)
    except ValueError:
        # Report the bad value instead of ending in a traceback.
        print(f'错误: 最小字数必须是整数 - {raw}')
        return None


def main():
    """Command-line entry point.

    Reads ``sys.argv``:

    * ``<chapter file> [min words]`` checks a single chapter.
    * ``--all <directory> [min words]`` checks every chapter in a directory.

    With no arguments the usage text is printed.  A missing directory after
    ``--all`` or a non-integer minimum prints an error line.  The function
    always returns None.
    """
    argv = sys.argv
    if len(argv) < 2:
        for usage_line in (
            '用法:',
            ' 检查单个章节: python check_chapter_wordcount.py <章节文件路径> [最小字数]',
            ' 检查所有章节: python check_chapter_wordcount.py --all <目录路径> [最小字数]',
            '',
            '示例:',
            ' python check_chapter_wordcount.py novels/故事/第01章.md',
            ' python check_chapter_wordcount.py novels/故事/第01章.md 3500',
            ' python check_chapter_wordcount.py --all novels/故事',
            ' python check_chapter_wordcount.py --all novels/故事 3500',
        ):
            print(usage_line)
        return

    if argv[1] == '--all':
        if len(argv) < 3:
            print('错误: 使用 --all 时需要指定目录路径')
            return
        min_words = _parse_min_words(argv, 3)
        if min_words is None:
            return
        results = check_all_chapters(argv[2], min_words=min_words)
    else:
        min_words = _parse_min_words(argv, 2)
        if min_words is None:
            return
        results = [check_chapter(argv[1], min_words)]

    print_results(results, min_words)


if __name__ == '__main__':
    main()