feat: metagpt-Python-PDF-Feedback-UI-20260326

This commit is contained in:
MetaGPT Agent
2026-03-26 18:04:54 +09:00
parent af145eec35
commit f90925795f
13 changed files with 837 additions and 0 deletions

1
.dependencies.json Normal file
View File

@@ -0,0 +1 @@
{"docs/system_design/20260326175651.json": ["docs/prd/20260326175651.json"], "docs/task/20260326175651.json": ["docs/system_design/20260326175651.json"], "pdf_opener/main.py": ["docs/system_design/20260326175651.json", "docs/task/20260326175651.json"], "tests/test_main.py": ["pdf_opener/main.py"]}

View File

@@ -0,0 +1 @@
{"Language":"zh_cn","Programming Language":"Python","Original Requirements":"一个Python程序打开PDF文件不需要UI尽量简单","Project Name":"pdf_opener","Product Goals":["实现简单高效地打开和读取PDF文件","保持代码简洁无需任何UI界面","提供基本的PDF内容提取功能"],"User Stories":["作为用户我希望能够通过命令行指定PDF文件路径并打开它","作为用户我希望能够提取PDF文件中的文本内容","作为用户,我希望程序能够处理文件不存在或格式错误的异常情况","作为用户我希望能够查看PDF的基本信息如页数等"],"Competitive Analysis":["PyPDF2: 纯Python实现支持PDF读取和基本操作但对复杂PDF支持有限","pdfplumber: 功能强大,支持文本和表格提取,但依赖较多","pdfminer.six: 文本提取精准但API较复杂","pymupdf (fitz): 性能优秀,功能全面,但库体积较大","pikepdf: 基于QPDF支持PDF读写功能较全面","pypdf: PyPDF2的继任者持续维护API简洁"],"Competitive Quadrant Chart":"quadrantChart\n title \"PDF处理库的易用性与功能性对比\"\n x-axis \"低易用性\" --> \"高易用性\"\n y-axis \"低功能性\" --> \"高功能性\"\n quadrant-1 \"首选方案\"\n quadrant-2 \"功能强但复杂\"\n quadrant-3 \"不推荐\"\n quadrant-4 \"简单但功能有限\"\n \"PyPDF2\": [0.55, 0.35]\n \"pdfplumber\": [0.45, 0.75]\n \"pdfminer.six\": [0.25, 0.70]\n \"pymupdf\": [0.60, 0.85]\n \"pikepdf\": [0.40, 0.65]\n \"pypdf\": [0.70, 0.50]\n \"目标方案\": [0.80, 0.45]","Requirement Analysis":"该项目需求非常简洁明确使用Python实现一个能够打开PDF文件的程序无需任何图形界面UI以命令行方式运行。核心功能包括接受PDF文件路径作为输入参数打开并读取PDF文件内容提取文本信息并输出到控制台同时处理常见异常如文件不存在、非PDF格式等。推荐使用pypdf或PyPDF2库因其安装简单、API直观完全满足\"尽量简单\"的要求。","Requirement Pool":[["P0","实现通过命令行参数接收PDF文件路径并打开文件"],["P0","使用第三方库如pypdf读取PDF文件内容并提取文本"],["P1","输出PDF基本信息如总页数、每页文本内容"],["P1","处理异常情况:文件不存在、文件格式错误、权限不足等"],["P2","支持指定页码范围进行内容提取"]],"UI Design draft":"无UI界面。程序通过命令行运行格式为python pdf_opener.py <pdf文件路径>。输出结果直接打印到终端包括PDF页数信息及各页提取的文本内容。","Anything UNCLEAR":"目前需求较为清晰。唯一待确认的是:程序是否只需打开并显示文本内容,还是需要支持其他操作(如复制、搜索、保存等)?当前按最简单的文本提取功能实现。"}

3
docs/requirement.txt Normal file
View File

@@ -0,0 +1,3 @@
一个Python程序打开PDF文件
[Feedback] 不需要UI尽量简单

View File

@@ -0,0 +1 @@
{"Implementation approach":"使用pypdf库实现PDF文件的读取和文本提取该库安装简单、API直观完全满足'尽量简单'的要求。程序通过Python标准库argparse接收命令行参数支持指定PDF文件路径和可选的页码范围。异常处理使用Python内置异常机制覆盖文件不存在、格式错误、权限不足等常见情况。整个程序只需一个主文件依赖最小化。","File list":["main.py"],"Data structures and interfaces":"\nclassDiagram\n class PDFOpener {\n -str file_path\n -int start_page\n -int end_page\n +__init__(file_path: str, start_page: int, end_page: int)\n +open() PdfReader\n +get_info(reader: PdfReader) dict\n +extract_text(reader: PdfReader) list[str]\n +display_info(info: dict) None\n +display_text(texts: list[str]) None\n +run() None\n }\n class Main {\n +parse_args() argparse.Namespace\n +main() None\n }\n Main --> PDFOpener : creates and uses\n","Program call flow":"\nsequenceDiagram\n participant M as Main\n participant AP as argparse\n participant PO as PDFOpener\n participant PR as PdfReader(pypdf)\n\n M->>AP: parse_args()\n AP-->>M: args(file_path, start_page, end_page)\n M->>PO: __init__(file_path, start_page, end_page)\n M->>PO: run()\n PO->>PO: open()\n PO->>PR: PdfReader(file_path)\n PR-->>PO: reader\n PO->>PO: get_info(reader)\n PR-->>PO: info(num_pages, metadata)\n PO->>PO: display_info(info)\n PO-->>M: print info to console\n PO->>PO: extract_text(reader)\n loop for each page in range\n PR-->>PO: page.extract_text()\n end\n PO->>PO: display_text(texts)\n PO-->>M: print text to console\n","Anything UNCLEAR":"当前需求已较为清晰按最简单的文本提取功能实现。程序仅支持文本内容提取和基本信息展示不支持复制、搜索、保存等高级操作。若后续需要支持加密PDF的解密或表格提取可考虑升级为pdfplumber或pymupdf库。"}

View File

@@ -0,0 +1 @@
{"Required packages":["pypdf==3.17.4"],"Required Other language third-party packages":["无需其他语言的第三方依赖"],"Logic Analysis":[["main.py","包含 PDFOpener 类和 Main 相关函数。PDFOpener 类需实现以下方法__init__(file_path: str, start_page: int, end_page: int) 用于初始化文件路径和页码范围open() 使用 pypdf.PdfReader 打开PDF文件并返回 reader 对象需处理文件不存在、格式错误、权限不足等异常get_info(reader: PdfReader) 获取PDF基本信息总页数、元数据等并返回 dictdisplay_info(info: dict) 将PDF信息格式化输出到控制台extract_text(reader: PdfReader) 按页码范围逐页提取文本,返回 list[str]display_text(texts: list[str]) 将提取的文本内容输出到控制台run() 作为主流程入口,依次调用 open、get_info、display_info、extract_text、display_text。Main 部分包含 parse_args() 函数,使用 argparse 解析命令行参数file_path 必填start_page 和 end_page 可选),以及 main() 函数作为程序入口,创建 PDFOpener 实例并调用 run()。依赖导入pypdf、argparse标准库、sys标准库。"]],"Task list":["main.py"],"Full API spec":"","Shared Knowledge":"`main.py` 是唯一的程序文件包含所有逻辑。PDFOpener 类封装了PDF读取、信息提取和文本提取的全部功能。命令行参数规范file_path 为位置参数(必填),--start-page 和 --end-page 为可选参数默认值分别为第1页和最后一页。异常处理统一在 open() 方法中捕获 FileNotFoundError、pypdf.errors.PdfReadError 及 PermissionError并以友好的错误信息输出后退出程序。","Anything UNCLEAR":"当前需求已较为清晰。页码范围参数的默认行为需确认start_page 默认为0第一页end_page 默认为None表示读取到最后一页。程序仅支持文本内容提取和基本信息展示不支持加密PDF解密、表格提取、搜索或保存等高级功能若后续有此类需求可考虑升级为 pdfplumber 或 pymupdf 库。"}

272
pdf_opener/main.py Normal file
View File

@@ -0,0 +1,272 @@
"""PDF文件读取和文本提取工具。
使用pypdf库实现PDF文件的读取、基本信息展示和文本内容提取。
支持通过命令行参数指定PDF文件路径和可选的页码范围。
"""
import argparse
import sys
from typing import Optional
import pypdf
from pypdf import PdfReader
class PDFOpener:
    """Open a PDF file, print its basic information and extract page text.

    Page positions are stored as 0-based indices; they are converted to
    1-based page numbers only when printed.

    Attributes:
        _file_path: Path of the PDF file to read.
        _start_page: 0-based index of the first page to extract (inclusive).
        _end_page: 0-based index of the last page to extract (inclusive);
            None means "through the last page".
    """

    def __init__(
        self,
        file_path: str,
        start_page: int = 0,
        end_page: Optional[int] = None,
    ) -> None:
        """Store the file path and the requested page range.

        Args:
            file_path: Path of the PDF file.
            start_page: 0-based index of the first page to extract.
            end_page: 0-based index of the last page to extract, or None
                for the last page of the document.
        """
        self._file_path: str = file_path
        self._start_page: int = start_page
        self._end_page: Optional[int] = end_page

    def open(self) -> PdfReader:
        """Open the PDF and return a ``PdfReader`` for it.

        Any failure to open the file is reported on stderr and ends the
        program with exit status 1.

        Returns:
            The reader created from ``self._file_path``.

        Raises:
            SystemExit: If the file is missing, unreadable, not a valid PDF,
                or cannot be opened for another OS-level reason.
        """
        try:
            reader = PdfReader(self._file_path)
            return reader
        except FileNotFoundError:
            print(
                f"错误:文件未找到,请检查路径是否正确:'{self._file_path}'",
                file=sys.stderr,
            )
            sys.exit(1)
        except pypdf.errors.PdfReadError as e:
            print(
                f"错误无法读取PDF文件文件可能已损坏或格式不正确{e}",
                file=sys.stderr,
            )
            sys.exit(1)
        except PermissionError:
            print(
                f"错误:没有权限读取文件:'{self._file_path}'",
                file=sys.stderr,
            )
            sys.exit(1)
        except OSError as e:
            # Must come after the specific OSError subclasses above. Covers
            # cases such as the path being a directory, which would
            # otherwise surface as a raw traceback.
            print(
                f"错误:无法打开文件:'{self._file_path}'({e})",
                file=sys.stderr,
            )
            sys.exit(1)

    def get_info(self, reader: PdfReader) -> dict:
        """Collect the page count and document metadata.

        Args:
            reader: Reader for the opened PDF.

        Returns:
            A dict with the keys ``num_pages``, ``title``, ``author``,
            ``subject``, ``creator``, ``producer`` and ``creation_date``.
            A metadata value that is missing or falsy is reported as "未知".
        """
        num_pages: int = len(reader.pages)
        metadata = reader.metadata
        info: dict = {"num_pages": num_pages}
        for field in (
            "title",
            "author",
            "subject",
            "creator",
            "producer",
            "creation_date",
        ):
            # ``metadata`` itself may be None or empty, so guard the lookup.
            value = getattr(metadata, field) if metadata else None
            info[field] = value if value else "未知"
        return info

    def display_info(self, info: dict) -> None:
        """Print the information dict as a framed block on stdout.

        Args:
            info: Dict as produced by ``get_info``; a missing key is shown
                as "未知".
        """
        separator: str = "=" * 50
        print(separator)
        print("PDF 文件基本信息")
        print(separator)
        print(f" 总页数 : {info.get('num_pages', '未知')}")
        print(f" 标题 : {info.get('title', '未知')}")
        print(f" 作者 : {info.get('author', '未知')}")
        print(f" 主题 : {info.get('subject', '未知')}")
        print(f" 创建工具 : {info.get('creator', '未知')}")
        print(f" 生成工具 : {info.get('producer', '未知')}")
        print(f" 创建日期 : {info.get('creation_date', '未知')}")
        print(separator)
        print()

    def extract_text(self, reader: PdfReader) -> list[str]:
        """Extract the text of every page in the configured range.

        The start index is clamped to 0 and the end index to the last page
        of the document. If the clamped range is empty, a warning is written
        to stderr and nothing is extracted.

        Args:
            reader: Reader for the opened PDF.

        Returns:
            One string per extracted page, in page order. A page for which
            the reader yields no text contributes an empty string.
        """
        page_count: int = len(reader.pages)
        first: int = max(0, self._start_page)
        if self._end_page is None:
            last: int = page_count - 1
        else:
            last = min(self._end_page, page_count - 1)
        if first > last:
            print(
                f"警告:起始页码 ({first + 1}) 大于结束页码 ({last + 1})"
                f"将不提取任何文本。",
                file=sys.stderr,
            )
            return []
        return [
            reader.pages[page_index].extract_text() or ""
            for page_index in range(first, last + 1)
        ]

    def display_text(self, texts: list[str]) -> None:
        """Print the extracted page texts, each under a page-number header.

        Args:
            texts: Page texts as returned by ``extract_text``.
        """
        if not texts:
            print("未提取到任何文本内容。")
            return
        separator: str = "-" * 50
        # Apply the same clamp as extract_text so the printed numbers match
        # the pages actually extracted, then convert to 1-based numbering.
        first_page_number: int = max(0, self._start_page) + 1
        for page_number, text in enumerate(texts, start=first_page_number):
            print(f"【第 {page_number} 页】")
            print(separator)
            if text.strip():
                print(text)
            else:
                print("(本页无可提取的文本内容)")
            print(separator)
            print()

    def run(self) -> None:
        """Run the whole flow: open, show information, extract, show text."""
        reader: PdfReader = self.open()
        info: dict = self.get_info(reader)
        self.display_info(info)
        texts: list[str] = self.extract_text(reader)
        self.display_text(texts)
def parse_args(argv: Optional[list[str]] = None) -> argparse.Namespace:
    """Parse and validate the command-line arguments.

    Args:
        argv: Argument strings to parse, without the program name. When
            None (the default), argparse reads ``sys.argv[1:]``.

    Returns:
        A namespace with:
            file_path: Path of the PDF file (required).
            start_page: 1-based first page, default 1.
            end_page: 1-based last page, or None for the last page.

    Raises:
        SystemExit: Raised by argparse when the arguments are invalid,
            including a page number below 1 or a start page greater than
            the end page.
    """
    parser = argparse.ArgumentParser(
        description="PDF文件读取和文本提取工具",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "示例用法:\n"
            " python main.py document.pdf\n"
            " python main.py document.pdf --start-page 2 --end-page 5\n"
            " python main.py document.pdf --start-page 3\n"
        ),
    )
    parser.add_argument(
        "file_path",
        type=str,
        help="PDF文件的路径必填",
    )
    parser.add_argument(
        "--start-page",
        type=int,
        default=1,
        dest="start_page",
        metavar="N",
        help="提取文本的起始页码从1开始默认为1",
    )
    parser.add_argument(
        "--end-page",
        type=int,
        default=None,
        dest="end_page",
        metavar="N",
        help="提取文本的结束页码从1开始默认为最后一页",
    )
    args: argparse.Namespace = parser.parse_args(argv)

    # Range checks argparse cannot express; parser.error() exits the program.
    if args.start_page < 1:
        parser.error("--start-page 必须大于等于1")
    if args.end_page is not None:
        if args.end_page < 1:
            parser.error("--end-page 必须大于等于1")
        if args.start_page > args.end_page:
            parser.error("--start-page 不能大于 --end-page")
    return args
def main() -> None:
    """Program entry point: parse the CLI arguments and run a PDFOpener."""
    cli = parse_args()

    # The CLI takes 1-based page numbers; PDFOpener works with 0-based indices.
    first_index = cli.start_page - 1
    if cli.end_page is None:
        last_index = None
    else:
        last_index = cli.end_page - 1

    PDFOpener(
        file_path=cli.file_path,
        start_page=first_index,
        end_page=last_index,
    ).run()


# Run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()

1
requirements.txt Normal file
View File

@@ -0,0 +1 @@
pypdf==3.17.4

View File

@@ -0,0 +1,15 @@
quadrantChart
title "PDF处理库的易用性与功能性对比"
x-axis "低易用性" --> "高易用性"
y-axis "低功能性" --> "高功能性"
quadrant-1 "首选方案"
quadrant-2 "功能强但复杂"
quadrant-3 "不推荐"
quadrant-4 "简单但功能有限"
"PyPDF2": [0.55, 0.35]
"pdfplumber": [0.45, 0.75]
"pdfminer.six": [0.25, 0.70]
"pymupdf": [0.60, 0.85]
"pikepdf": [0.40, 0.65]
"pypdf": [0.70, 0.50]
"目标方案": [0.80, 0.45]

View File

@@ -0,0 +1,19 @@
classDiagram
class PDFOpener {
-str file_path
-int start_page
-int end_page
+__init__(file_path: str, start_page: int, end_page: int)
+open() PdfReader
+get_info(reader: PdfReader) dict
+extract_text(reader: PdfReader) list[str]
+display_info(info: dict) None
+display_text(texts: list[str]) None
+run() None
}
class Main {
+parse_args() argparse.Namespace
+main() None
}
Main --> PDFOpener : creates and uses

View File

@@ -0,0 +1,76 @@
## Language
zh_cn
## Programming Language
Python
## Original Requirements
一个Python程序打开PDF文件不需要UI尽量简单
## Project Name
pdf_opener
## Product Goals
- 实现简单高效地打开和读取PDF文件
- 保持代码简洁无需任何UI界面
- 提供基本的PDF内容提取功能
## User Stories
- 作为用户我希望能够通过命令行指定PDF文件路径并打开它
- 作为用户我希望能够提取PDF文件中的文本内容
- 作为用户,我希望程序能够处理文件不存在或格式错误的异常情况
- 作为用户我希望能够查看PDF的基本信息如页数等
## Competitive Analysis
- PyPDF2: 纯Python实现支持PDF读取和基本操作但对复杂PDF支持有限
- pdfplumber: 功能强大,支持文本和表格提取,但依赖较多
- pdfminer.six: 文本提取精准但API较复杂
- pymupdf (fitz): 性能优秀,功能全面,但库体积较大
- pikepdf: 基于QPDF支持PDF读写功能较全面
- pypdf: PyPDF2的继任者持续维护API简洁
## Competitive Quadrant Chart
quadrantChart
title "PDF处理库的易用性与功能性对比"
x-axis "低易用性" --> "高易用性"
y-axis "低功能性" --> "高功能性"
quadrant-1 "首选方案"
quadrant-2 "功能强但复杂"
quadrant-3 "不推荐"
quadrant-4 "简单但功能有限"
"PyPDF2": [0.55, 0.35]
"pdfplumber": [0.45, 0.75]
"pdfminer.six": [0.25, 0.70]
"pymupdf": [0.60, 0.85]
"pikepdf": [0.40, 0.65]
"pypdf": [0.70, 0.50]
"目标方案": [0.80, 0.45]
## Requirement Analysis
该项目需求非常简洁明确:使用Python实现一个能够打开PDF文件的程序,无需任何图形界面(UI),以命令行方式运行。核心功能包括:接受PDF文件路径作为输入参数,打开并读取PDF文件内容,提取文本信息并输出到控制台,同时处理常见异常(如文件不存在、非PDF格式等)。推荐使用pypdf或PyPDF2库,因其安装简单、API直观,完全满足"尽量简单"的要求。
## Requirement Pool
- ['P0', '实现通过命令行参数接收PDF文件路径并打开文件']
- ['P0', '使用第三方库如pypdf读取PDF文件内容并提取文本']
- ['P1', '输出PDF基本信息如总页数、每页文本内容']
- ['P1', '处理异常情况:文件不存在、文件格式错误、权限不足等']
- ['P2', '支持指定页码范围进行内容提取']
## UI Design draft
无UI界面。程序通过命令行运行,格式为:python main.py <pdf文件路径>。输出结果直接打印到终端,包括PDF页数信息及各页提取的文本内容。
## Anything UNCLEAR
目前需求较为清晰。唯一待确认的是:程序是否只需打开并显示文本内容,还是需要支持其他操作(如复制、搜索、保存等)?当前按最简单的文本提取功能实现。

View File

@@ -0,0 +1,24 @@
sequenceDiagram
participant M as Main
participant AP as argparse
participant PO as PDFOpener
participant PR as PdfReader(pypdf)
M->>AP: parse_args()
AP-->>M: args(file_path, start_page, end_page)
M->>PO: __init__(file_path, start_page, end_page)
M->>PO: run()
PO->>PO: open()
PO->>PR: PdfReader(file_path)
PR-->>PO: reader
PO->>PO: get_info(reader)
PR-->>PO: info(num_pages, metadata)
PO->>PO: display_info(info)
PO-->>M: print info to console
PO->>PO: extract_text(reader)
loop for each page in range
PR-->>PO: page.extract_text()
end
PO->>PO: display_text(texts)
PO-->>M: print text to console

View File

@@ -0,0 +1,63 @@
## Implementation approach
使用pypdf库实现PDF文件的读取和文本提取,该库安装简单、API直观,完全满足'尽量简单'的要求。程序通过Python标准库argparse接收命令行参数,支持指定PDF文件路径和可选的页码范围。异常处理使用Python内置异常机制,覆盖文件不存在、格式错误、权限不足等常见情况。整个程序只需一个主文件,依赖最小化。
## File list
- main.py
## Data structures and interfaces
classDiagram
class PDFOpener {
-str file_path
-int start_page
-int end_page
+__init__(file_path: str, start_page: int, end_page: int)
+open() PdfReader
+get_info(reader: PdfReader) dict
+extract_text(reader: PdfReader) list[str]
+display_info(info: dict) None
+display_text(texts: list[str]) None
+run() None
}
class Main {
+parse_args() argparse.Namespace
+main() None
}
Main --> PDFOpener : creates and uses
## Program call flow
sequenceDiagram
participant M as Main
participant AP as argparse
participant PO as PDFOpener
participant PR as PdfReader(pypdf)
M->>AP: parse_args()
AP-->>M: args(file_path, start_page, end_page)
M->>PO: __init__(file_path, start_page, end_page)
M->>PO: run()
PO->>PO: open()
PO->>PR: PdfReader(file_path)
PR-->>PO: reader
PO->>PO: get_info(reader)
PR-->>PO: info(num_pages, metadata)
PO->>PO: display_info(info)
PO-->>M: print info to console
PO->>PO: extract_text(reader)
loop for each page in range
PR-->>PO: page.extract_text()
end
PO->>PO: display_text(texts)
PO-->>M: print text to console
## Anything UNCLEAR
当前需求已较为清晰按最简单的文本提取功能实现。程序仅支持文本内容提取和基本信息展示不支持复制、搜索、保存等高级操作。若后续需要支持加密PDF的解密或表格提取可考虑升级为pdfplumber或pymupdf库。

360
tests/test_main.py Normal file
View File

@@ -0,0 +1,360 @@
## test_main.py
```python
"""
Comprehensive test suite for pdf_opener/main.py.
Tests cover PDFOpener class methods, parse_args function,
edge cases, error handling, and the main entry point.
"""
import argparse
import sys
import unittest
from io import StringIO
from typing import Optional
from unittest.mock import MagicMock, patch, PropertyMock
# Adjust import path since tests run from /data
sys.path.insert(0, "/data")
from pdf_opener.main import PDFOpener, parse_args, main # noqa: E402
## Helper Factories
def make_mock_page(text: str = "Sample page text") -> MagicMock:
    """Build a stand-in PDF page whose ``extract_text()`` returns *text*."""
    fake_page = MagicMock()
    fake_page.configure_mock(**{"extract_text.return_value": text})
    return fake_page
def make_mock_metadata(
    title: Optional[str] = "Test Title",
    author: Optional[str] = "Test Author",
    subject: Optional[str] = "Test Subject",
    creator: Optional[str] = "Test Creator",
    producer: Optional[str] = "Test Producer",
    creation_date: Optional[str] = "2024-01-01",
) -> MagicMock:
    """Build a stand-in metadata object exposing the given field values.

    Each argument becomes an attribute of the same name on the returned
    mock; pass None to simulate a field the document does not provide.
    """
    field_values = {
        "title": title,
        "author": author,
        "subject": subject,
        "creator": creator,
        "producer": producer,
        "creation_date": creation_date,
    }
    fake_metadata = MagicMock()
    for attr_name, attr_value in field_values.items():
        setattr(fake_metadata, attr_name, attr_value)
    return fake_metadata
def make_mock_reader(
    num_pages: int = 3,
    page_texts: Optional[list] = None,
    metadata: Optional[MagicMock] = None,
) -> MagicMock:
    """Build a stand-in PdfReader with mock pages and metadata.

    Args:
        num_pages: Number of pages to generate when *page_texts* is None;
            page ``k`` (1-based) gets the text ``"Page k content"``.
        page_texts: Explicit page texts; when given, *num_pages* is ignored.
        metadata: Metadata mock to attach. None is replaced by the default
            ``make_mock_metadata()`` object, so this helper never yields a
            reader whose ``metadata`` is None.
    """
    if page_texts is not None:
        texts = page_texts
    else:
        texts = [f"Page {number} content" for number in range(1, num_pages + 1)]

    fake_reader = MagicMock()
    fake_reader.pages = [make_mock_page(page_text) for page_text in texts]
    if metadata is None:
        metadata = make_mock_metadata()
    fake_reader.metadata = metadata
    return fake_reader
## TestPDFOpenerInit
class TestPDFOpenerInit(unittest.TestCase):
    """Constructor behaviour of PDFOpener: defaults and explicit arguments."""

    def test_default_initialization(self) -> None:
        """With only a path, the start index is 0 and there is no end index."""
        instance = PDFOpener(file_path="test.pdf")
        self.assertEqual(instance._file_path, "test.pdf")
        self.assertEqual(instance._start_page, 0)
        self.assertIsNone(instance._end_page)

    def test_custom_start_page(self) -> None:
        """An explicit start_page is stored unchanged."""
        instance = PDFOpener(file_path="test.pdf", start_page=2)
        self.assertEqual(instance._start_page, 2)

    def test_custom_end_page(self) -> None:
        """An explicit end_page is stored unchanged."""
        instance = PDFOpener(file_path="test.pdf", end_page=5)
        self.assertEqual(instance._end_page, 5)

    def test_full_custom_initialization(self) -> None:
        """All three constructor arguments are stored as given."""
        instance = PDFOpener(
            file_path="/path/to/doc.pdf",
            start_page=1,
            end_page=4,
        )
        self.assertEqual(instance._file_path, "/path/to/doc.pdf")
        self.assertEqual(instance._start_page, 1)
        self.assertEqual(instance._end_page, 4)

    def test_file_path_type_is_str(self) -> None:
        """The stored file path is a str."""
        stored_path = PDFOpener(file_path="sample.pdf")._file_path
        self.assertIsInstance(stored_path, str)

    def test_start_page_type_is_int(self) -> None:
        """The stored start page is an int."""
        stored_start = PDFOpener(file_path="sample.pdf", start_page=3)._start_page
        self.assertIsInstance(stored_start, int)

    def test_end_page_none_by_default(self) -> None:
        """Omitting end_page leaves it as None."""
        stored_end = PDFOpener(file_path="sample.pdf")._end_page
        self.assertIsNone(stored_end)
## TestPDFOpenerOpen
class TestPDFOpenerOpen(unittest.TestCase):
    """PDFOpener.open: the success path and each handled failure.

    ``PdfReader`` is patched where ``pdf_opener.main`` looks it up, so no
    real file is touched.
    """

    @patch("pdf_opener.main.PdfReader")
    def test_open_returns_reader_on_success(self, mock_pdf_reader: MagicMock) -> None:
        """open() returns the object produced by PdfReader(file_path)."""
        expected_reader = MagicMock()
        mock_pdf_reader.return_value = expected_reader
        returned = PDFOpener(file_path="valid.pdf").open()
        mock_pdf_reader.assert_called_once_with("valid.pdf")
        self.assertEqual(returned, expected_reader)

    @patch("pdf_opener.main.PdfReader")
    def test_open_file_not_found_exits(self, mock_pdf_reader: MagicMock) -> None:
        """A missing file ends the program with exit status 1."""
        mock_pdf_reader.side_effect = FileNotFoundError
        target = PDFOpener(file_path="nonexistent.pdf")
        with self.assertRaises(SystemExit) as raised:
            target.open()
        self.assertEqual(raised.exception.code, 1)

    @patch("pdf_opener.main.PdfReader")
    def test_open_file_not_found_prints_error(self, mock_pdf_reader: MagicMock) -> None:
        """A missing file is reported on stderr, naming the path."""
        mock_pdf_reader.side_effect = FileNotFoundError
        target = PDFOpener(file_path="nonexistent.pdf")
        with patch("sys.stderr", new_callable=StringIO) as captured_err:
            with self.assertRaises(SystemExit):
                target.open()
        self.assertIn("nonexistent.pdf", captured_err.getvalue())

    @patch("pdf_opener.main.PdfReader")
    def test_open_pdf_read_error_exits(self, mock_pdf_reader: MagicMock) -> None:
        """A PdfReadError ends the program with exit status 1."""
        from pypdf.errors import PdfReadError

        mock_pdf_reader.side_effect = PdfReadError("corrupted")
        target = PDFOpener(file_path="corrupted.pdf")
        with self.assertRaises(SystemExit) as raised:
            target.open()
        self.assertEqual(raised.exception.code, 1)

    @patch("pdf_opener.main.PdfReader")
    def test_open_pdf_read_error_prints_error(self, mock_pdf_reader: MagicMock) -> None:
        """A PdfReadError is reported on stderr with its message."""
        from pypdf.errors import PdfReadError

        mock_pdf_reader.side_effect = PdfReadError("corrupted")
        target = PDFOpener(file_path="corrupted.pdf")
        with patch("sys.stderr", new_callable=StringIO) as captured_err:
            with self.assertRaises(SystemExit):
                target.open()
        self.assertIn("corrupted", captured_err.getvalue())

    @patch("pdf_opener.main.PdfReader")
    def test_open_permission_error_exits(self, mock_pdf_reader: MagicMock) -> None:
        """A PermissionError ends the program with exit status 1."""
        mock_pdf_reader.side_effect = PermissionError
        target = PDFOpener(file_path="protected.pdf")
        with self.assertRaises(SystemExit) as raised:
            target.open()
        self.assertEqual(raised.exception.code, 1)

    @patch("pdf_opener.main.PdfReader")
    def test_open_permission_error_prints_error(self, mock_pdf_reader: MagicMock) -> None:
        """A PermissionError is reported on stderr, naming the path."""
        mock_pdf_reader.side_effect = PermissionError
        target = PDFOpener(file_path="protected.pdf")
        with patch("sys.stderr", new_callable=StringIO) as captured_err:
            with self.assertRaises(SystemExit):
                target.open()
        self.assertIn("protected.pdf", captured_err.getvalue())
## TestPDFOpenerGetInfo
class TestPDFOpenerGetInfo(unittest.TestCase):
    """PDFOpener.get_info: page count and metadata fallbacks."""

    # Metadata keys that get_info reports as "未知" when the value is missing.
    METADATA_KEYS = (
        "title",
        "author",
        "subject",
        "creator",
        "producer",
        "creation_date",
    )

    def setUp(self) -> None:
        """Create the PDFOpener under test."""
        self.opener = PDFOpener(file_path="test.pdf")

    def test_get_info_returns_dict(self) -> None:
        """get_info returns a dict."""
        reader = make_mock_reader(num_pages=5)
        result = self.opener.get_info(reader)
        self.assertIsInstance(result, dict)

    def test_get_info_num_pages(self) -> None:
        """num_pages equals the number of pages on the reader."""
        reader = make_mock_reader(num_pages=7)
        result = self.opener.get_info(reader)
        self.assertEqual(result["num_pages"], 7)

    def test_get_info_with_full_metadata(self) -> None:
        """Every populated metadata field is passed through unchanged."""
        metadata = make_mock_metadata(
            title="My PDF",
            author="John Doe",
            subject="Testing",
            creator="Word",
            producer="Adobe",
            creation_date="2023-06-15",
        )
        reader = make_mock_reader(num_pages=3, metadata=metadata)
        result = self.opener.get_info(reader)
        self.assertEqual(result["title"], "My PDF")
        self.assertEqual(result["author"], "John Doe")
        self.assertEqual(result["subject"], "Testing")
        self.assertEqual(result["creator"], "Word")
        self.assertEqual(result["producer"], "Adobe")
        self.assertEqual(result["creation_date"], "2023-06-15")

    def test_get_info_with_none_metadata(self) -> None:
        """A reader whose metadata is None yields "未知" for every field."""
        reader = make_mock_reader(num_pages=2)
        # make_mock_reader replaces metadata=None with a populated default
        # mock, so None has to be assigned on the reader directly.
        reader.metadata = None
        result = self.opener.get_info(reader)
        for key in self.METADATA_KEYS:
            with self.subTest(key=key):
                self.assertEqual(result[key], "未知")

    def test_get_info_with_partial_metadata_none_fields(self) -> None:
        """Only the fields that are None fall back to "未知"."""
        metadata = make_mock_metadata(
            title=None,
            author="Known Author",
            subject=None,
            creator=None,
            producer="Known Producer",
            creation_date=None,
        )
        reader = make_mock_reader(num_pages=1, metadata=metadata)
        result = self.opener.get_info(reader)
        self.assertEqual(result["title"], "未知")
        self.assertEqual(result["author"], "Known Author")
        self.assertEqual(result["subject"], "未知")
        self.assertEqual(result["creator"], "未知")
        self.assertEqual(result["producer"], "Known Producer")
        self.assertEqual(result["creation_date"], "未知")

    def test_get_info_contains_all_keys(self) -> None:
        """The result has num_pages plus every metadata key."""
        reader = make_mock_reader(num_pages=1)
        result = self.opener.get_info(reader)
        for key in ("num_pages",) + self.METADATA_KEYS:
            with self.subTest(key=key):
                self.assertIn(key, result)

    def test_get_info_single_page_pdf(self) -> None:
        """A one-page document reports num_pages == 1."""
        reader = make_mock_reader(num_pages=1)
        result = self.opener.get_info(reader)
        self.assertEqual(result["num_pages"], 1)
## TestPDFOpenerDisplayInfo
class TestPDFOpenerDisplayInfo(unittest.TestCase):
"""Tests for PDFOpener.display_info method."""
def setUp(self) -> None:
"""Set up common test fixtures."""
self.opener = PDFOpener(file_path="test.pdf")
self.sample_info: dict = {
"num_pages": 10,
"title": "Sample Title",
"author": "Sample Author",
"subject": "Sample Subject",
"creator": "Sample Creator",
"producer": "Sample Producer",
"creation_date": "2024-01-01",
}
def test_display_info_prints_num_pages(self) -> None:
"""Test that display_info prints the number of pages."""
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
self.opener.display_info(self.sample_info)
output = mock_stdout.getvalue()
self.assertIn("10", output)
def test_display_info_prints_title(self) -> None:
"""Test that display_info prints the title."""
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
self.opener.display_info(self.sample_info)
output = mock_stdout.getvalue()
self.assertIn("Sample Title", output)
def test_display_info_prints_author(self) -> None:
"""Test that display_info prints the author."""
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
self.opener.display_info(self.sample_info)
output = mock_stdout.getvalue()
self.assertIn("Sample Author", output)
def test_display_info_prints_separator(self) -> None:
"""Test that display_info prints separator lines."""
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
self.opener.display_info(self.sample_info)
output = mock_stdout.getvalue()
self.assertIn("=" * 50, output)
def test_display_info_prints_header(self) -> None:
"""Test that display_info prints the header text."""
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
self.opener.display_info(self.sample_info)
output = mock_stdout.getvalue()
self.assertIn("PDF 文件基本信息", output)
def test_display_info_with_unknown_values(self) -> None:
"""Test display_info with unknown/missing values."""
info: dict = {
"num_pages": 1,
"title": "未知",
"author": "未知",
"subject": "未知",
"creator": "未知",
"producer": "未知",
"creation_date": "未知",
}
with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
self.opener.display_info(info)
output = mock_stdout.getvalue()
self.assertIn("未知", output)
def test_display_info_with_empty_dict(self) -> None:
"""Test display_info with empty dict uses default '未知'."""
with patch("sys.stdout", new_callable=StringIO) as