-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcli.py
More file actions
178 lines (143 loc) · 5.91 KB
/
cli.py
File metadata and controls
178 lines (143 loc) · 5.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/env python3
"""
docs2md CLI — 将文档/图片转为 Markdown 或纯文本
用法:
python cli.py <输入文件或目录> [选项]
选项:
-o, --output PATH 输出文件或目录(默认:<输入>.md 或 <输入>_out/)
-f, --format 输出格式:md(默认)或 txt
-q, --quiet 只输出错误,不打印进度
支持格式:
文档 .docx .doc .xlsx .xls .pdf .txt
图片 .png .jpg .jpeg .gif .webp .bmp .tiff .tif
"""
import argparse
import asyncio
import sys
from pathlib import Path
# ── Progress callback (prints to the terminal) ──────────────────────────────
def make_sse_callback(quiet: bool):
    """Build the async progress callback that is passed to the converters.

    The returned coroutine function takes one event dict and reads its
    ``"type"`` and ``"content"`` keys (both default to ``""``):

    * ``"error"`` events are always written to stderr, even when *quiet* is
      set, because ``--quiet`` is documented as "only print errors".
    * ``"debug"`` events are written to stdout unless *quiet* is set.
    * Any other event type is ignored.

    Args:
        quiet: When true, suppress progress (debug) output.

    Returns:
        An ``async`` callable accepting a single ``dict`` and returning None.
    """

    async def _cb(data: dict) -> None:
        kind = data.get("type", "")
        msg = data.get("content", "")
        if kind == "error":
            # Never gated on `quiet`: quiet mode must still surface errors.
            print(f" [错误] {msg}", file=sys.stderr)
        elif kind == "debug" and not quiet:
            print(f" {msg}")

    return _cb
# ── Single-file conversion ──────────────────────────────────────────────────
async def convert_one(
    input_path: Path,
    output_dir: Path,
    fmt: str,
    quiet: bool,
) -> bool:
    """Convert a single file with the converter matching its extension.

    The extension is compared case-insensitively. ``.docx``/``.doc``,
    ``.xlsx``/``.xls``, ``.pdf`` and ``.txt`` map to their dedicated
    converters; any extension listed in ``IMAGE_EXTENSIONS`` goes to the
    image converter. The converter result is read as a mapping with an
    optional ``"error"`` entry and an optional ``"path"`` entry.

    Args:
        input_path: File to convert.
        output_dir: Directory handed to the converter for its output.
        fmt: Output format string, forwarded unchanged to the converter.
        quiet: Suppress the progress callback output and the final
            ``->`` line printed on success.

    Returns:
        True when the converter reported no error; False when the extension
        is unsupported or the result carries a truthy ``"error"``.
    """
    # Converters are imported lazily, inside the function, as in the original.
    from backend.converters.docx_converter import convert_docx
    from backend.converters.excel_converter import convert_excel
    from backend.converters.pdf_converter import convert_pdf
    from backend.converters.txt_converter import convert_txt
    from backend.converters.image_converter import convert_image, IMAGE_EXTENSIONS

    progress = make_sse_callback(quiet)
    suffix = input_path.suffix.lower()

    # Fixed extensions take priority; image extensions are the fallback.
    by_suffix = {
        ".docx": convert_docx,
        ".doc": convert_docx,
        ".xlsx": convert_excel,
        ".xls": convert_excel,
        ".pdf": convert_pdf,
        ".txt": convert_txt,
    }
    converter = by_suffix.get(suffix)
    if converter is None and suffix in IMAGE_EXTENSIONS:
        converter = convert_image

    if converter is None:
        print(f"[跳过] 不支持的格式:{input_path.name}", file=sys.stderr)
        return False

    result = await converter(input_path, output_dir, fmt, sse_callback=progress)

    if result.get("error"):
        print(f"[失败] {input_path.name}:{result['error']}", file=sys.stderr)
        return False

    written = result.get("path", "")
    if not quiet:
        print(f" -> {written}")
    return True
# ── Batch conversion of a directory ─────────────────────────────────────────
async def convert_dir(
    input_dir: Path,
    output_dir: Path,
    fmt: str,
    quiet: bool,
) -> None:
    """Run the batch conversion for a directory and print a summary.

    The work is delegated to ``traverse_and_convert``; this function only
    reports on the result entries it returns. Each entry is read as a
    mapping: a truthy ``"error"`` marks a failure, and an entry whose
    ``"path"`` equals ``"index"`` is treated as the index record (its
    ``"output"`` value is printed) and is not counted as a success.

    The summary line and index line go to stdout even when *quiet* is set;
    the per-file failure list goes to stderr.

    Args:
        input_dir: Directory to convert.
        output_dir: Directory handed to the traversal for its output.
        fmt: Output format string, forwarded unchanged.
        quiet: Forwarded to the progress callback only.
    """
    from backend.utils.traversal import traverse_and_convert

    progress = make_sse_callback(quiet)
    results = await traverse_and_convert(
        input_dir, output_dir, fmt, sse_callback=progress
    )

    # Successes are counted first, then failures, matching the original order.
    succeeded = len(
        [
            entry
            for entry in results
            if not entry.get("error") and entry.get("path") != "index"
        ]
    )
    failures = [entry for entry in results if entry.get("error")]
    index_entry = next(
        (entry for entry in results if entry.get("path") == "index"),
        None,
    )

    print(f"\n完成:{succeeded} 个成功,{len(failures)} 个失败。")
    if index_entry:
        print(f"索引:{index_entry.get('output', '')}")

    if not failures:
        return

    print("\n失败列表:", file=sys.stderr)
    for entry in failures:
        print(f" {entry['path']}:{entry['error']}", file=sys.stderr)
# ── Entry point ─────────────────────────────────────────────────────────────
def build_parser() -> argparse.ArgumentParser:
    """Create the ``docs2md`` command-line parser.

    Defines one positional argument (``INPUT``) and three options:
    ``-o/--output``, ``-f/--format`` (``md`` or ``txt``, default ``md``)
    and the ``-q/--quiet`` flag. The epilog holds usage examples and is
    shown verbatim because the raw-description formatter is used.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    # The continuation lines sit at column 0 so the string value is unchanged.
    examples = """示例:
python cli.py report.pdf
python cli.py report.pdf -o output/report.md
python cli.py ./docs/ -o ./output/ -f txt
python cli.py photo.png --quiet"""

    parser = argparse.ArgumentParser(
        prog="docs2md",
        description="将文档/图片转为 Markdown 或纯文本",
        epilog=examples,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument("input", help="输入文件或目录", metavar="INPUT")
    parser.add_argument("-o", "--output", help="输出文件或目录", metavar="PATH")
    parser.add_argument(
        "-f",
        "--format",
        default="md",
        choices=["md", "txt"],
        help="输出格式,默认 md",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        help="只输出错误信息",
    )
    return parser
def main() -> None:
    """Parse the command line and run directory or single-file conversion.

    Exits with status 1 when the input path does not exist.

    Directory mode: the output directory is ``--output`` if given, otherwise
    ``<input>_out`` next to the input directory; it is created if missing.

    Single-file mode: the output directory is derived from ``--output``:

    * if it names an existing directory, or ends with a path separator
      (e.g. ``-o ./output/``), that directory itself is used;
    * otherwise it is treated as a file path and its parent directory is
      used. Only the directory is forwarded to ``convert_one``; the file
      name component is not passed on.

    Without ``--output`` the input file's own directory is used. The process
    exits 0 on success and 1 on failure.
    """
    import os  # local import: only needed to detect a trailing path separator

    parser = build_parser()
    args = parser.parse_args()

    inp = Path(args.input).resolve()
    if not inp.exists():
        print(f"错误:路径不存在:{inp}", file=sys.stderr)
        sys.exit(1)

    fmt: str = args.format
    quiet: bool = args.quiet

    if inp.is_dir():
        # Directory mode
        if args.output:
            out = Path(args.output).resolve()
        else:
            out = inp.parent / (inp.name + "_out")
        out.mkdir(parents=True, exist_ok=True)

        if not quiet:
            print(f"输入目录:{inp}")
            print(f"输出目录:{out}")
            print(f"输出格式:{fmt}\n")

        # NOTE(review): convert_dir returns None, so directory mode exits
        # with status 0 even when individual files failed.
        asyncio.run(convert_dir(inp, out, fmt, quiet))
    else:
        # Single-file mode
        if args.output:
            out_path = Path(args.output).resolve()
            # Path() drops a trailing separator, so inspect the raw argument
            # to tell "-o out/" (a directory) apart from "-o out/report.md".
            separators = tuple(sep for sep in (os.sep, os.altsep) if sep)
            names_directory = out_path.is_dir() or args.output.endswith(separators)
            output_dir = out_path if names_directory else out_path.parent
            output_dir.mkdir(parents=True, exist_ok=True)
        else:
            output_dir = inp.parent

        if not quiet:
            print(f"输入文件:{inp}")
            print(f"输出目录:{output_dir}")
            print(f"输出格式:{fmt}\n")

        ok = asyncio.run(convert_one(inp, output_dir, fmt, quiet))
        sys.exit(0 if ok else 1)


if __name__ == "__main__":
    main()