feat: metagpt-Python-PDF-Feedback-UI-20260326
This commit is contained in:
1
.dependencies.json
Normal file
1
.dependencies.json
Normal file
@@ -0,0 +1 @@
|
||||
{"docs/system_design/20260326175651.json": ["docs/prd/20260326175651.json"], "docs/task/20260326175651.json": ["docs/system_design/20260326175651.json"], "pdf_opener/main.py": ["docs/system_design/20260326175651.json", "docs/task/20260326175651.json"], "tests/test_main.py": ["pdf_opener/main.py"]}
|
||||
1
docs/prd/20260326175651.json
Normal file
1
docs/prd/20260326175651.json
Normal file
@@ -0,0 +1 @@
|
||||
{"Language":"zh_cn","Programming Language":"Python","Original Requirements":"一个Python程序打开PDF文件,不需要UI,尽量简单","Project Name":"pdf_opener","Product Goals":["实现简单高效地打开和读取PDF文件","保持代码简洁,无需任何UI界面","提供基本的PDF内容提取功能"],"User Stories":["作为用户,我希望能够通过命令行指定PDF文件路径并打开它","作为用户,我希望能够提取PDF文件中的文本内容","作为用户,我希望程序能够处理文件不存在或格式错误的异常情况","作为用户,我希望能够查看PDF的基本信息,如页数等"],"Competitive Analysis":["PyPDF2: 纯Python实现,支持PDF读取和基本操作,但对复杂PDF支持有限","pdfplumber: 功能强大,支持文本和表格提取,但依赖较多","pdfminer.six: 文本提取精准,但API较复杂","pymupdf (fitz): 性能优秀,功能全面,但库体积较大","pikepdf: 基于QPDF,支持PDF读写,功能较全面","pypdf: PyPDF2的继任者,持续维护,API简洁"],"Competitive Quadrant Chart":"quadrantChart\n title \"PDF处理库的易用性与功能性对比\"\n x-axis \"低易用性\" --> \"高易用性\"\n y-axis \"低功能性\" --> \"高功能性\"\n quadrant-1 \"首选方案\"\n quadrant-2 \"功能强但复杂\"\n quadrant-3 \"不推荐\"\n quadrant-4 \"简单但功能有限\"\n \"PyPDF2\": [0.55, 0.35]\n \"pdfplumber\": [0.45, 0.75]\n \"pdfminer.six\": [0.25, 0.70]\n \"pymupdf\": [0.60, 0.85]\n \"pikepdf\": [0.40, 0.65]\n \"pypdf\": [0.70, 0.50]\n \"目标方案\": [0.80, 0.45]","Requirement Analysis":"该项目需求非常简洁明确:使用Python实现一个能够打开PDF文件的程序,无需任何图形界面(UI),以命令行方式运行。核心功能包括:接受PDF文件路径作为输入参数,打开并读取PDF文件内容,提取文本信息并输出到控制台,同时处理常见异常(如文件不存在、非PDF格式等)。推荐使用pypdf或PyPDF2库,因其安装简单、API直观,完全满足\"尽量简单\"的要求。","Requirement Pool":[["P0","实现通过命令行参数接收PDF文件路径并打开文件"],["P0","使用第三方库(如pypdf)读取PDF文件内容并提取文本"],["P1","输出PDF基本信息,如总页数、每页文本内容"],["P1","处理异常情况:文件不存在、文件格式错误、权限不足等"],["P2","支持指定页码范围进行内容提取"]],"UI Design draft":"无UI界面。程序通过命令行运行,格式为:python pdf_opener.py <pdf文件路径>。输出结果直接打印到终端,包括PDF页数信息及各页提取的文本内容。","Anything UNCLEAR":"目前需求较为清晰。唯一待确认的是:程序是否只需打开并显示文本内容,还是需要支持其他操作(如复制、搜索、保存等)?当前按最简单的文本提取功能实现。"}
|
||||
3
docs/requirement.txt
Normal file
3
docs/requirement.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
一个Python程序打开PDF文件
|
||||
|
||||
[Feedback] 不需要UI,尽量简单
|
||||
1
docs/system_design/20260326175651.json
Normal file
1
docs/system_design/20260326175651.json
Normal file
@@ -0,0 +1 @@
|
||||
{"Implementation approach":"使用pypdf库实现PDF文件的读取和文本提取,该库安装简单、API直观,完全满足'尽量简单'的要求。程序通过Python标准库argparse接收命令行参数,支持指定PDF文件路径和可选的页码范围。异常处理使用Python内置异常机制,覆盖文件不存在、格式错误、权限不足等常见情况。整个程序只需一个主文件,依赖最小化。","File list":["main.py"],"Data structures and interfaces":"\nclassDiagram\n class PDFOpener {\n -str file_path\n -int start_page\n -int end_page\n +__init__(file_path: str, start_page: int, end_page: int)\n +open() PdfReader\n +get_info(reader: PdfReader) dict\n +extract_text(reader: PdfReader) list[str]\n +display_info(info: dict) None\n +display_text(texts: list[str]) None\n +run() None\n }\n class Main {\n +parse_args() argparse.Namespace\n +main() None\n }\n Main --> PDFOpener : creates and uses\n","Program call flow":"\nsequenceDiagram\n participant M as Main\n participant AP as argparse\n participant PO as PDFOpener\n participant PR as PdfReader(pypdf)\n\n M->>AP: parse_args()\n AP-->>M: args(file_path, start_page, end_page)\n M->>PO: __init__(file_path, start_page, end_page)\n M->>PO: run()\n PO->>PO: open()\n PO->>PR: PdfReader(file_path)\n PR-->>PO: reader\n PO->>PO: get_info(reader)\n PR-->>PO: info(num_pages, metadata)\n PO->>PO: display_info(info)\n PO-->>M: print info to console\n PO->>PO: extract_text(reader)\n loop for each page in range\n PR-->>PO: page.extract_text()\n end\n PO->>PO: display_text(texts)\n PO-->>M: print text to console\n","Anything UNCLEAR":"当前需求已较为清晰,按最简单的文本提取功能实现。程序仅支持文本内容提取和基本信息展示,不支持复制、搜索、保存等高级操作。若后续需要支持加密PDF的解密或表格提取,可考虑升级为pdfplumber或pymupdf库。"}
|
||||
1
docs/task/20260326175651.json
Normal file
1
docs/task/20260326175651.json
Normal file
@@ -0,0 +1 @@
|
||||
{"Required packages":["pypdf==3.17.4"],"Required Other language third-party packages":["无需其他语言的第三方依赖"],"Logic Analysis":[["main.py","包含 PDFOpener 类和 Main 相关函数。PDFOpener 类需实现以下方法:__init__(file_path: str, start_page: int, end_page: int) 用于初始化文件路径和页码范围;open() 使用 pypdf.PdfReader 打开PDF文件并返回 reader 对象,需处理文件不存在、格式错误、权限不足等异常;get_info(reader: PdfReader) 获取PDF基本信息(总页数、元数据等)并返回 dict;display_info(info: dict) 将PDF信息格式化输出到控制台;extract_text(reader: PdfReader) 按页码范围逐页提取文本,返回 list[str];display_text(texts: list[str]) 将提取的文本内容输出到控制台;run() 作为主流程入口,依次调用 open、get_info、display_info、extract_text、display_text。Main 部分包含 parse_args() 函数,使用 argparse 解析命令行参数(file_path 必填,start_page 和 end_page 可选),以及 main() 函数作为程序入口,创建 PDFOpener 实例并调用 run()。依赖导入:pypdf、argparse(标准库)、sys(标准库)。"]],"Task list":["main.py"],"Full API spec":"","Shared Knowledge":"`main.py` 是唯一的程序文件,包含所有逻辑。PDFOpener 类封装了PDF读取、信息提取和文本提取的全部功能。命令行参数规范:file_path 为位置参数(必填),--start-page 和 --end-page 为可选参数(默认值分别为第1页和最后一页)。异常处理统一在 open() 方法中捕获 FileNotFoundError、pypdf.errors.PdfReadError 及 PermissionError,并以友好的错误信息输出后退出程序。","Anything UNCLEAR":"当前需求已较为清晰。页码范围参数的默认行为需确认:start_page 默认为0(第一页),end_page 默认为None(表示读取到最后一页)。程序仅支持文本内容提取和基本信息展示,不支持加密PDF解密、表格提取、搜索或保存等高级功能,若后续有此类需求可考虑升级为 pdfplumber 或 pymupdf 库。"}
|
||||
272
pdf_opener/main.py
Normal file
272
pdf_opener/main.py
Normal file
@@ -0,0 +1,272 @@
|
||||
"""PDF文件读取和文本提取工具。
|
||||
|
||||
使用pypdf库实现PDF文件的读取、基本信息展示和文本内容提取。
|
||||
支持通过命令行参数指定PDF文件路径和可选的页码范围。
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import pypdf
|
||||
from pypdf import PdfReader
|
||||
|
||||
|
||||
class PDFOpener:
    """Open a PDF file, print its basic information and extract page text.

    Page positions are stored as 0-based indexes and are converted to
    1-based numbers only when they are shown to the user.

    Attributes:
        _file_path: Path of the PDF file to read.
        _start_page: 0-based index of the first page to extract (inclusive).
            Negative values are treated as 0.
        _end_page: 0-based index of the last page to extract (inclusive);
            ``None`` means "through the last page".
    """

    # Metadata attributes copied into the dict returned by get_info().
    _METADATA_FIELDS = (
        "title",
        "author",
        "subject",
        "creator",
        "producer",
        "creation_date",
    )

    # Placeholder stored when a metadata value is missing or empty.
    _UNKNOWN = "未知"

    def __init__(
        self,
        file_path: str,
        start_page: int = 0,
        end_page: Optional[int] = None,
    ) -> None:
        """Store the file path and the requested page range.

        Args:
            file_path: Path of the PDF file.
            start_page: 0-based index of the first page to extract.
            end_page: 0-based index of the last page to extract, or ``None``
                for the last page of the document.
        """
        self._file_path: str = file_path
        self._start_page: int = start_page
        self._end_page: Optional[int] = end_page

    def _first_page_index(self) -> int:
        """Return the 0-based index extraction really starts at (never < 0)."""
        return max(0, self._start_page)

    def open(self) -> "PdfReader":
        """Open the PDF file and return a reader for it.

        Returns:
            The ``pypdf.PdfReader`` created for ``_file_path``.

        Raises:
            SystemExit: With exit code 1, after an error message has been
                written to stderr, when the file is missing, unreadable,
                not a valid PDF, or cannot be opened for another OS-level
                reason.
        """
        try:
            return PdfReader(self._file_path)
        except FileNotFoundError:
            message = f"错误:文件未找到,请检查路径是否正确:'{self._file_path}'"
        except pypdf.errors.PdfReadError as exc:
            message = f"错误:无法读取PDF文件,文件可能已损坏或格式不正确:{exc}"
        except PermissionError:
            message = f"错误:没有权限读取文件:'{self._file_path}'"
        except OSError as exc:
            # Any other OS-level failure (for example the path is a
            # directory) should also end with a message, not a traceback.
            message = f"错误:无法打开文件:'{self._file_path}'({exc})"

        print(message, file=sys.stderr)
        sys.exit(1)

    def get_info(self, reader: "PdfReader") -> dict:
        """Collect the page count and document metadata.

        Args:
            reader: Reader exposing ``pages`` and ``metadata``.

        Returns:
            A dict with ``num_pages`` plus one entry per name in
            ``_METADATA_FIELDS``. Missing or empty metadata values are
            replaced by ``_UNKNOWN``.
        """
        info: dict = {"num_pages": len(reader.pages)}
        metadata = reader.metadata

        for field in self._METADATA_FIELDS:
            # A falsy metadata object (None or empty) yields no values at all.
            value = getattr(metadata, field) if metadata else None
            info[field] = value if value else self._UNKNOWN
        return info

    def display_info(self, info: dict) -> None:
        """Print the information dict as a small report on stdout.

        Args:
            info: Dict as produced by :meth:`get_info`; absent keys are
                printed as "未知".
        """
        separator: str = "=" * 50
        print(separator)
        print("PDF 文件基本信息")
        print(separator)
        print(f" 总页数 : {info.get('num_pages', '未知')}")
        print(f" 标题 : {info.get('title', '未知')}")
        print(f" 作者 : {info.get('author', '未知')}")
        print(f" 主题 : {info.get('subject', '未知')}")
        print(f" 创建工具 : {info.get('creator', '未知')}")
        print(f" 生成工具 : {info.get('producer', '未知')}")
        print(f" 创建日期 : {info.get('creation_date', '未知')}")
        print(separator)
        print()

    def extract_text(self, reader: "PdfReader") -> list[str]:
        """Extract the text of every page in the configured range.

        The start index is clamped to 0 and the end index to the last page
        of the document.

        Args:
            reader: Reader exposing ``pages``.

        Returns:
            One string per extracted page, in page order. A page whose
            ``extract_text()`` returns a falsy value contributes ``""``.
            The list is empty (and a warning is written to stderr) when the
            clamped start lies after the clamped end.
        """
        pages = reader.pages
        last_available: int = len(pages) - 1

        first: int = self._first_page_index()
        last: int = (
            last_available
            if self._end_page is None
            else min(self._end_page, last_available)
        )

        if first > last:
            print(
                f"警告:起始页码 ({first + 1}) 大于结束页码 ({last + 1}),"
                f"将不提取任何文本。",
                file=sys.stderr,
            )
            return []

        return [pages[index].extract_text() or "" for index in range(first, last + 1)]

    def display_text(self, texts: list[str]) -> None:
        """Print the extracted page texts on stdout, one section per page.

        Args:
            texts: Page texts as returned by :meth:`extract_text`.
        """
        if not texts:
            print("未提取到任何文本内容。")
            return

        separator: str = "-" * 50
        # Number pages from the same clamped start extract_text() used, so
        # the labels match the pages that were actually read.
        first_number: int = self._first_page_index() + 1

        for page_number, text in enumerate(texts, start=first_number):
            print(f"【第 {page_number} 页】")
            print(separator)
            if text.strip():
                print(text)
            else:
                print("(本页无可提取的文本内容)")
            print(separator)
            print()

    def run(self) -> None:
        """Run the whole flow: open, show information, extract, show text."""
        reader = self.open()
        self.display_info(self.get_info(reader))
        self.display_text(self.extract_text(reader))
|
||||
|
||||
|
||||
def parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace:
    """Parse and validate the command-line arguments.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        A namespace with:
            - ``file_path``: path of the PDF file (required positional).
            - ``start_page``: 1-based first page, default 1.
            - ``end_page``: 1-based last page, or ``None`` for the last page.

    Raises:
        SystemExit: Raised by argparse (exit code 2) when the arguments are
            malformed, a page number is below 1, or the start page is after
            the end page.
    """
    parser = argparse.ArgumentParser(
        description="PDF文件读取和文本提取工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "示例用法:\n"
            " python main.py document.pdf\n"
            " python main.py document.pdf --start-page 2 --end-page 5\n"
            " python main.py document.pdf --start-page 3\n"
        ),
    )
    parser.add_argument(
        "file_path",
        type=str,
        help="PDF文件的路径(必填)",
    )
    parser.add_argument(
        "--start-page",
        type=int,
        default=1,
        dest="start_page",
        metavar="N",
        help="提取文本的起始页码(从1开始,默认为1)",
    )
    parser.add_argument(
        "--end-page",
        type=int,
        default=None,
        dest="end_page",
        metavar="N",
        help="提取文本的结束页码(从1开始,默认为最后一页)",
    )

    args: argparse.Namespace = parser.parse_args(argv)

    # Page numbers are 1-based on the command line, so anything below 1 or
    # an inverted range is rejected through argparse's own error path.
    if args.start_page < 1:
        parser.error("--start-page 必须大于等于1")
    if args.end_page is not None:
        if args.end_page < 1:
            parser.error("--end-page 必须大于等于1")
        if args.start_page > args.end_page:
            parser.error("--start-page 不能大于 --end-page")

    return args
|
||||
|
||||
|
||||
def main() -> None:
    """Program entry point: parse the command line and run a PDFOpener."""
    cli: argparse.Namespace = parse_args()

    # The command line uses 1-based page numbers; PDFOpener expects 0-based
    # indexes, with None kept as "through the last page".
    first_index: int = cli.start_page - 1
    last_index: Optional[int] = None
    if cli.end_page is not None:
        last_index = cli.end_page - 1

    opener = PDFOpener(
        file_path=cli.file_path,
        start_page=first_index,
        end_page=last_index,
    )
    opener.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@@ -0,0 +1 @@
|
||||
pypdf==3.17.4
|
||||
15
resources/competitive_analysis/20260326175651.mmd
Normal file
15
resources/competitive_analysis/20260326175651.mmd
Normal file
@@ -0,0 +1,15 @@
|
||||
quadrantChart
|
||||
title "PDF处理库的易用性与功能性对比"
|
||||
x-axis "低易用性" --> "高易用性"
|
||||
y-axis "低功能性" --> "高功能性"
|
||||
quadrant-1 "首选方案"
|
||||
quadrant-2 "功能强但复杂"
|
||||
quadrant-3 "不推荐"
|
||||
quadrant-4 "简单但功能有限"
|
||||
"PyPDF2": [0.55, 0.35]
|
||||
"pdfplumber": [0.45, 0.75]
|
||||
"pdfminer.six": [0.25, 0.70]
|
||||
"pymupdf": [0.60, 0.85]
|
||||
"pikepdf": [0.40, 0.65]
|
||||
"pypdf": [0.70, 0.50]
|
||||
"目标方案": [0.80, 0.45]
|
||||
19
resources/data_api_design/20260326175651.mmd
Normal file
19
resources/data_api_design/20260326175651.mmd
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
classDiagram
|
||||
class PDFOpener {
|
||||
-str file_path
|
||||
-int start_page
|
||||
-int end_page
|
||||
+__init__(file_path: str, start_page: int, end_page: int)
|
||||
+open() PdfReader
|
||||
+get_info(reader: PdfReader) dict
|
||||
+extract_text(reader: PdfReader) list[str]
|
||||
+display_info(info: dict) None
|
||||
+display_text(texts: list[str]) None
|
||||
+run() None
|
||||
}
|
||||
class Main {
|
||||
+parse_args() argparse.Namespace
|
||||
+main() None
|
||||
}
|
||||
Main --> PDFOpener : creates and uses
|
||||
76
resources/prd/20260326175651.md
Normal file
76
resources/prd/20260326175651.md
Normal file
@@ -0,0 +1,76 @@
|
||||
## Language
|
||||
|
||||
zh_cn
|
||||
|
||||
## Programming Language
|
||||
|
||||
Python
|
||||
|
||||
## Original Requirements
|
||||
|
||||
一个Python程序打开PDF文件,不需要UI,尽量简单
|
||||
|
||||
## Project Name
|
||||
|
||||
pdf_opener
|
||||
|
||||
## Product Goals
|
||||
|
||||
- 实现简单高效地打开和读取PDF文件
|
||||
- 保持代码简洁,无需任何UI界面
|
||||
- 提供基本的PDF内容提取功能
|
||||
|
||||
## User Stories
|
||||
|
||||
- 作为用户,我希望能够通过命令行指定PDF文件路径并打开它
|
||||
- 作为用户,我希望能够提取PDF文件中的文本内容
|
||||
- 作为用户,我希望程序能够处理文件不存在或格式错误的异常情况
|
||||
- 作为用户,我希望能够查看PDF的基本信息,如页数等
|
||||
|
||||
## Competitive Analysis
|
||||
|
||||
- PyPDF2: 纯Python实现,支持PDF读取和基本操作,但对复杂PDF支持有限
|
||||
- pdfplumber: 功能强大,支持文本和表格提取,但依赖较多
|
||||
- pdfminer.six: 文本提取精准,但API较复杂
|
||||
- pymupdf (fitz): 性能优秀,功能全面,但库体积较大
|
||||
- pikepdf: 基于QPDF,支持PDF读写,功能较全面
|
||||
- pypdf: PyPDF2的继任者,持续维护,API简洁
|
||||
|
||||
## Competitive Quadrant Chart
|
||||
|
||||
quadrantChart
|
||||
title "PDF处理库的易用性与功能性对比"
|
||||
x-axis "低易用性" --> "高易用性"
|
||||
y-axis "低功能性" --> "高功能性"
|
||||
quadrant-1 "首选方案"
|
||||
quadrant-2 "功能强但复杂"
|
||||
quadrant-3 "不推荐"
|
||||
quadrant-4 "简单但功能有限"
|
||||
"PyPDF2": [0.55, 0.35]
|
||||
"pdfplumber": [0.45, 0.75]
|
||||
"pdfminer.six": [0.25, 0.70]
|
||||
"pymupdf": [0.60, 0.85]
|
||||
"pikepdf": [0.40, 0.65]
|
||||
"pypdf": [0.70, 0.50]
|
||||
"目标方案": [0.80, 0.45]
|
||||
|
||||
## Requirement Analysis
|
||||
|
||||
该项目需求非常简洁明确:使用Python实现一个能够打开PDF文件的程序,无需任何图形界面(UI),以命令行方式运行。核心功能包括:接受PDF文件路径作为输入参数,打开并读取PDF文件内容,提取文本信息并输出到控制台,同时处理常见异常(如文件不存在、非PDF格式等)。推荐使用pypdf或PyPDF2库,因其安装简单、API直观,完全满足"尽量简单"的要求。
|
||||
|
||||
## Requirement Pool
|
||||
|
||||
- ['P0', '实现通过命令行参数接收PDF文件路径并打开文件']
|
||||
- ['P0', '使用第三方库(如pypdf)读取PDF文件内容并提取文本']
|
||||
- ['P1', '输出PDF基本信息,如总页数、每页文本内容']
|
||||
- ['P1', '处理异常情况:文件不存在、文件格式错误、权限不足等']
|
||||
- ['P2', '支持指定页码范围进行内容提取']
|
||||
|
||||
## UI Design draft
|
||||
|
||||
无UI界面。程序通过命令行运行,格式为:python main.py <pdf文件路径> [--start-page N] [--end-page N]。输出结果直接打印到终端,包括PDF页数信息及各页提取的文本内容。
|
||||
|
||||
## Anything UNCLEAR
|
||||
|
||||
目前需求较为清晰。唯一待确认的是:程序是否只需打开并显示文本内容,还是需要支持其他操作(如复制、搜索、保存等)?当前按最简单的文本提取功能实现。
|
||||
|
||||
24
resources/seq_flow/20260326175651.mmd
Normal file
24
resources/seq_flow/20260326175651.mmd
Normal file
@@ -0,0 +1,24 @@
|
||||
|
||||
sequenceDiagram
|
||||
participant M as Main
|
||||
participant AP as argparse
|
||||
participant PO as PDFOpener
|
||||
participant PR as PdfReader(pypdf)
|
||||
|
||||
M->>AP: parse_args()
|
||||
AP-->>M: args(file_path, start_page, end_page)
|
||||
M->>PO: __init__(file_path, start_page, end_page)
|
||||
M->>PO: run()
|
||||
PO->>PO: open()
|
||||
PO->>PR: PdfReader(file_path)
|
||||
PR-->>PO: reader
|
||||
PO->>PO: get_info(reader)
|
||||
PR-->>PO: info(num_pages, metadata)
|
||||
PO->>PO: display_info(info)
|
||||
PO-->>M: print info to console
|
||||
PO->>PO: extract_text(reader)
|
||||
loop for each page in range
|
||||
PR-->>PO: page.extract_text()
|
||||
end
|
||||
PO->>PO: display_text(texts)
|
||||
PO-->>M: print text to console
|
||||
63
resources/system_design/20260326175651.md
Normal file
63
resources/system_design/20260326175651.md
Normal file
@@ -0,0 +1,63 @@
|
||||
## Implementation approach
|
||||
|
||||
使用pypdf库实现PDF文件的读取和文本提取,该库安装简单、API直观,完全满足'尽量简单'的要求。程序通过Python标准库argparse接收命令行参数,支持指定PDF文件路径和可选的页码范围。异常处理使用Python内置异常机制,覆盖文件不存在、格式错误、权限不足等常见情况。整个程序只需一个主文件,依赖最小化。
|
||||
|
||||
## File list
|
||||
|
||||
- main.py
|
||||
|
||||
## Data structures and interfaces
|
||||
|
||||
|
||||
classDiagram
|
||||
class PDFOpener {
|
||||
-str file_path
|
||||
-int start_page
|
||||
-int end_page
|
||||
+__init__(file_path: str, start_page: int, end_page: int)
|
||||
+open() PdfReader
|
||||
+get_info(reader: PdfReader) dict
|
||||
+extract_text(reader: PdfReader) list[str]
|
||||
+display_info(info: dict) None
|
||||
+display_text(texts: list[str]) None
|
||||
+run() None
|
||||
}
|
||||
class Main {
|
||||
+parse_args() argparse.Namespace
|
||||
+main() None
|
||||
}
|
||||
Main --> PDFOpener : creates and uses
|
||||
|
||||
|
||||
## Program call flow
|
||||
|
||||
|
||||
sequenceDiagram
|
||||
participant M as Main
|
||||
participant AP as argparse
|
||||
participant PO as PDFOpener
|
||||
participant PR as PdfReader(pypdf)
|
||||
|
||||
M->>AP: parse_args()
|
||||
AP-->>M: args(file_path, start_page, end_page)
|
||||
M->>PO: __init__(file_path, start_page, end_page)
|
||||
M->>PO: run()
|
||||
PO->>PO: open()
|
||||
PO->>PR: PdfReader(file_path)
|
||||
PR-->>PO: reader
|
||||
PO->>PO: get_info(reader)
|
||||
PR-->>PO: info(num_pages, metadata)
|
||||
PO->>PO: display_info(info)
|
||||
PO-->>M: print info to console
|
||||
PO->>PO: extract_text(reader)
|
||||
loop for each page in range
|
||||
PR-->>PO: page.extract_text()
|
||||
end
|
||||
PO->>PO: display_text(texts)
|
||||
PO-->>M: print text to console
|
||||
|
||||
|
||||
## Anything UNCLEAR
|
||||
|
||||
当前需求已较为清晰,按最简单的文本提取功能实现。程序仅支持文本内容提取和基本信息展示,不支持复制、搜索、保存等高级操作。若后续需要支持加密PDF的解密或表格提取,可考虑升级为pdfplumber或pymupdf库。
|
||||
|
||||
360
tests/test_main.py
Normal file
360
tests/test_main.py
Normal file
@@ -0,0 +1,360 @@
|
||||
## test_main.py
|
||||
|
||||
# Test module for pdf_opener/main.py (stray Markdown code fence removed so the file parses as Python).
|
||||
"""
|
||||
Comprehensive test suite for pdf_opener/main.py.
|
||||
|
||||
Tests cover PDFOpener class methods, parse_args function,
|
||||
edge cases, error handling, and the main entry point.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import unittest
|
||||
from io import StringIO
|
||||
from typing import Optional
|
||||
from unittest.mock import MagicMock, patch, PropertyMock
|
||||
|
||||
# Put the repository root (the parent of this tests/ directory) on the import
# path so `pdf_opener` resolves wherever the checkout lives.
import os  # noqa: E402

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from pdf_opener.main import PDFOpener, parse_args, main # noqa: E402
|
||||
|
||||
|
||||
## Helper Factories
|
||||
|
||||
def make_mock_page(text: str = "Sample page text") -> MagicMock:
    """Build a stand-in PDF page whose ``extract_text()`` returns ``text``."""
    return MagicMock(**{"extract_text.return_value": text})
|
||||
|
||||
|
||||
def make_mock_metadata(
    title: Optional[str] = "Test Title",
    author: Optional[str] = "Test Author",
    subject: Optional[str] = "Test Subject",
    creator: Optional[str] = "Test Creator",
    producer: Optional[str] = "Test Producer",
    creation_date: Optional[str] = "2024-01-01",
) -> MagicMock:
    """Build a stand-in metadata object exposing the six given attributes."""
    fields = {
        "title": title,
        "author": author,
        "subject": subject,
        "creator": creator,
        "producer": producer,
        "creation_date": creation_date,
    }
    mock_metadata = MagicMock()
    mock_metadata.configure_mock(**fields)
    return mock_metadata
|
||||
|
||||
|
||||
def make_mock_reader(
    num_pages: int = 3,
    page_texts: Optional[list] = None,
    metadata: Optional[MagicMock] = None,
) -> MagicMock:
    """Build a stand-in PdfReader with a list of mock pages and metadata.

    Args:
        num_pages: Number of pages to generate when ``page_texts`` is not
            given; each gets the text "Page <n> content".
        page_texts: Explicit page texts. When given, it alone decides how
            many pages exist and ``num_pages`` is ignored.
        metadata: Metadata object to attach. ``None`` is replaced by the
            default ``make_mock_metadata()`` object, so it cannot be used
            to request a reader without metadata.
    """
    texts = (
        page_texts
        if page_texts is not None
        else [f"Page {number} content" for number in range(1, num_pages + 1)]
    )

    mock_reader = MagicMock()
    mock_reader.pages = [make_mock_page(page_text) for page_text in texts]
    mock_reader.metadata = make_mock_metadata() if metadata is None else metadata
    return mock_reader
|
||||
|
||||
|
||||
## TestPDFOpenerInit
|
||||
|
||||
class TestPDFOpenerInit(unittest.TestCase):
    """Constructor behaviour of PDFOpener."""

    def test_default_initialization(self) -> None:
        """Only a path given: start index is 0 and there is no end index."""
        subject = PDFOpener(file_path="test.pdf")
        self.assertEqual(subject._file_path, "test.pdf")
        self.assertEqual(subject._start_page, 0)
        self.assertIsNone(subject._end_page)

    def test_custom_start_page(self) -> None:
        """An explicit start_page is stored unchanged."""
        self.assertEqual(
            PDFOpener(file_path="test.pdf", start_page=2)._start_page, 2
        )

    def test_custom_end_page(self) -> None:
        """An explicit end_page is stored unchanged."""
        self.assertEqual(
            PDFOpener(file_path="test.pdf", end_page=5)._end_page, 5
        )

    def test_full_custom_initialization(self) -> None:
        """All three constructor arguments are stored unchanged."""
        subject = PDFOpener(file_path="/path/to/doc.pdf", start_page=1, end_page=4)
        self.assertEqual(subject._file_path, "/path/to/doc.pdf")
        self.assertEqual(subject._start_page, 1)
        self.assertEqual(subject._end_page, 4)

    def test_file_path_type_is_str(self) -> None:
        """The stored file path is a str."""
        subject = PDFOpener(file_path="sample.pdf")
        self.assertIsInstance(subject._file_path, str)

    def test_start_page_type_is_int(self) -> None:
        """The stored start page is an int."""
        subject = PDFOpener(file_path="sample.pdf", start_page=3)
        self.assertIsInstance(subject._start_page, int)

    def test_end_page_none_by_default(self) -> None:
        """end_page is None when it is not passed."""
        subject = PDFOpener(file_path="sample.pdf")
        self.assertIsNone(subject._end_page)
|
||||
|
||||
|
||||
## TestPDFOpenerOpen
|
||||
|
||||
class TestPDFOpenerOpen(unittest.TestCase):
    """Behaviour of PDFOpener.open with the PdfReader constructor patched."""

    def _exit_code_of_open(self, file_path: str) -> object:
        """Call open() expecting SystemExit and return its exit code."""
        with self.assertRaises(SystemExit) as caught:
            PDFOpener(file_path=file_path).open()
        return caught.exception.code

    def _stderr_of_open(self, file_path: str) -> str:
        """Call open() expecting SystemExit and return what went to stderr."""
        with self.assertRaises(SystemExit):
            with patch("sys.stderr", new_callable=StringIO) as fake_stderr:
                PDFOpener(file_path=file_path).open()
        return fake_stderr.getvalue()

    @patch("pdf_opener.main.PdfReader")
    def test_open_returns_reader_on_success(self, mock_pdf_reader: MagicMock) -> None:
        """open() returns whatever the PdfReader constructor produced."""
        expected = MagicMock()
        mock_pdf_reader.return_value = expected

        returned = PDFOpener(file_path="valid.pdf").open()

        mock_pdf_reader.assert_called_once_with("valid.pdf")
        self.assertEqual(returned, expected)

    @patch("pdf_opener.main.PdfReader")
    def test_open_file_not_found_exits(self, mock_pdf_reader: MagicMock) -> None:
        """A missing file ends the program with exit code 1."""
        mock_pdf_reader.side_effect = FileNotFoundError
        self.assertEqual(self._exit_code_of_open("nonexistent.pdf"), 1)

    @patch("pdf_opener.main.PdfReader")
    def test_open_file_not_found_prints_error(self, mock_pdf_reader: MagicMock) -> None:
        """A missing file is reported on stderr with its path."""
        mock_pdf_reader.side_effect = FileNotFoundError
        self.assertIn("nonexistent.pdf", self._stderr_of_open("nonexistent.pdf"))

    @patch("pdf_opener.main.PdfReader")
    def test_open_pdf_read_error_exits(self, mock_pdf_reader: MagicMock) -> None:
        """A PdfReadError ends the program with exit code 1."""
        import pypdf

        mock_pdf_reader.side_effect = pypdf.errors.PdfReadError("corrupted")
        self.assertEqual(self._exit_code_of_open("corrupted.pdf"), 1)

    @patch("pdf_opener.main.PdfReader")
    def test_open_pdf_read_error_prints_error(self, mock_pdf_reader: MagicMock) -> None:
        """A PdfReadError is reported on stderr."""
        import pypdf

        mock_pdf_reader.side_effect = pypdf.errors.PdfReadError("corrupted")
        self.assertIn("corrupted", self._stderr_of_open("corrupted.pdf"))

    @patch("pdf_opener.main.PdfReader")
    def test_open_permission_error_exits(self, mock_pdf_reader: MagicMock) -> None:
        """A PermissionError ends the program with exit code 1."""
        mock_pdf_reader.side_effect = PermissionError
        self.assertEqual(self._exit_code_of_open("protected.pdf"), 1)

    @patch("pdf_opener.main.PdfReader")
    def test_open_permission_error_prints_error(self, mock_pdf_reader: MagicMock) -> None:
        """A PermissionError is reported on stderr with the file path."""
        mock_pdf_reader.side_effect = PermissionError
        self.assertIn("protected.pdf", self._stderr_of_open("protected.pdf"))
|
||||
|
||||
|
||||
## TestPDFOpenerGetInfo
|
||||
|
||||
class TestPDFOpenerGetInfo(unittest.TestCase):
    """Behaviour of PDFOpener.get_info against mock readers."""

    # Every metadata-derived key get_info() is expected to return.
    METADATA_KEYS = (
        "title",
        "author",
        "subject",
        "creator",
        "producer",
        "creation_date",
    )

    def setUp(self) -> None:
        """Create the PDFOpener shared by all tests in this class."""
        self.opener = PDFOpener(file_path="test.pdf")

    def test_get_info_returns_dict(self) -> None:
        """get_info returns a dict."""
        reader = make_mock_reader(num_pages=5)
        self.assertIsInstance(self.opener.get_info(reader), dict)

    def test_get_info_num_pages(self) -> None:
        """num_pages equals the number of pages of the reader."""
        reader = make_mock_reader(num_pages=7)
        self.assertEqual(self.opener.get_info(reader)["num_pages"], 7)

    def test_get_info_with_full_metadata(self) -> None:
        """Every metadata value present on the reader is passed through."""
        expected = {
            "title": "My PDF",
            "author": "John Doe",
            "subject": "Testing",
            "creator": "Word",
            "producer": "Adobe",
            "creation_date": "2023-06-15",
        }
        reader = make_mock_reader(
            num_pages=3, metadata=make_mock_metadata(**expected)
        )
        result = self.opener.get_info(reader)

        for key, value in expected.items():
            with self.subTest(key=key):
                self.assertEqual(result[key], value)

    def test_get_info_with_none_metadata(self) -> None:
        """A reader whose metadata is None yields "未知" for every field."""
        reader = make_mock_reader(num_pages=2)
        # make_mock_reader() swaps metadata=None for default metadata, so the
        # "no metadata" case has to be forced on the mock afterwards.
        reader.metadata = None
        result = self.opener.get_info(reader)

        for key in self.METADATA_KEYS:
            with self.subTest(key=key):
                self.assertEqual(result[key], "未知")

    def test_get_info_with_partial_metadata_none_fields(self) -> None:
        """Fields that are None become "未知"; the others pass through."""
        metadata = make_mock_metadata(
            title=None,
            author="Known Author",
            subject=None,
            creator=None,
            producer="Known Producer",
            creation_date=None,
        )
        reader = make_mock_reader(num_pages=1, metadata=metadata)
        result = self.opener.get_info(reader)

        expected = {
            "title": "未知",
            "author": "Known Author",
            "subject": "未知",
            "creator": "未知",
            "producer": "Known Producer",
            "creation_date": "未知",
        }
        for key, value in expected.items():
            with self.subTest(key=key):
                self.assertEqual(result[key], value)

    def test_get_info_contains_all_keys(self) -> None:
        """The result has num_pages plus every metadata key."""
        reader = make_mock_reader(num_pages=1)
        result = self.opener.get_info(reader)

        for key in ("num_pages",) + self.METADATA_KEYS:
            with self.subTest(key=key):
                self.assertIn(key, result)

    def test_get_info_single_page_pdf(self) -> None:
        """A one-page reader reports num_pages == 1."""
        reader = make_mock_reader(num_pages=1)
        self.assertEqual(self.opener.get_info(reader)["num_pages"], 1)
|
||||
|
||||
|
||||
## TestPDFOpenerDisplayInfo
|
||||
|
||||
class TestPDFOpenerDisplayInfo(unittest.TestCase):
|
||||
"""Tests for PDFOpener.display_info method."""
|
||||
|
||||
def setUp(self) -> None:
|
||||
"""Set up common test fixtures."""
|
||||
self.opener = PDFOpener(file_path="test.pdf")
|
||||
self.sample_info: dict = {
|
||||
"num_pages": 10,
|
||||
"title": "Sample Title",
|
||||
"author": "Sample Author",
|
||||
"subject": "Sample Subject",
|
||||
"creator": "Sample Creator",
|
||||
"producer": "Sample Producer",
|
||||
"creation_date": "2024-01-01",
|
||||
}
|
||||
|
||||
def test_display_info_prints_num_pages(self) -> None:
|
||||
"""Test that display_info prints the number of pages."""
|
||||
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
|
||||
self.opener.display_info(self.sample_info)
|
||||
output = mock_stdout.getvalue()
|
||||
self.assertIn("10", output)
|
||||
|
||||
def test_display_info_prints_title(self) -> None:
|
||||
"""Test that display_info prints the title."""
|
||||
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
|
||||
self.opener.display_info(self.sample_info)
|
||||
output = mock_stdout.getvalue()
|
||||
self.assertIn("Sample Title", output)
|
||||
|
||||
def test_display_info_prints_author(self) -> None:
|
||||
"""Test that display_info prints the author."""
|
||||
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
|
||||
self.opener.display_info(self.sample_info)
|
||||
output = mock_stdout.getvalue()
|
||||
self.assertIn("Sample Author", output)
|
||||
|
||||
def test_display_info_prints_separator(self) -> None:
|
||||
"""Test that display_info prints separator lines."""
|
||||
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
|
||||
self.opener.display_info(self.sample_info)
|
||||
output = mock_stdout.getvalue()
|
||||
self.assertIn("=" * 50, output)
|
||||
|
||||
def test_display_info_prints_header(self) -> None:
|
||||
"""Test that display_info prints the header text."""
|
||||
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
|
||||
self.opener.display_info(self.sample_info)
|
||||
output = mock_stdout.getvalue()
|
||||
self.assertIn("PDF 文件基本信息", output)
|
||||
|
||||
def test_display_info_with_unknown_values(self) -> None:
|
||||
"""Test display_info with unknown/missing values."""
|
||||
info: dict = {
|
||||
"num_pages": 1,
|
||||
"title": "未知",
|
||||
"author": "未知",
|
||||
"subject": "未知",
|
||||
"creator": "未知",
|
||||
"producer": "未知",
|
||||
"creation_date": "未知",
|
||||
}
|
||||
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
|
||||
self.opener.display_info(info)
|
||||
output = mock_stdout.getvalue()
|
||||
self.assertIn("未知", output)
|
||||
|
||||
def test_display_info_with_empty_dict(self) -> None:
|
||||
"""Test display_info with empty dict uses default '未知'."""
|
||||
with patch("sys.stdout", new_callable=StringIO) as
|
||||
Reference in New Issue
Block a user