## word转markdown word没法直接给AI,转MD后就可以处理,把协议转成数据库,直接把两天的工作量缩减到2分钟 ```bash pandoc document.docx -f docx -t markdown --wrap=none -s -o document.md ``` ## pdf转word mac ```bash brew update brew install ocrmypdf tesseract tesseract-lang qpdf ghostscript # 查看已装语言 tesseract --list-langs #对单个 PDF 执行 OCR(含中文简体) ocrmypdf --language chi_sim+eng ./in.pdf ./out_ocr.pdf --sidecar ./out.txt ``` ## pdf转md mac脚本内容如下: ```bash #!/usr/bin/env zsh # pdf-to-md.sh # 一键将 PDF 转为 Markdown 的脚本(包含依赖检查/安装、OCR、图片预处理、合并与清洗) # 目标输出:in_clean.md(与输入 PDF 同目录) # 用法: # ./pdf-to-md.sh /absolute/path/to/in.pdf # 如果不传入参数,脚本会尝试处理 ~/in.pdf set -euo pipefail IFS=$'\n\t' # --------- 配置(可修改) --------- DEFAULT_INPUT="$HOME/in.pdf" # 安装软件列表(Homebrew) BREW_PKGS=(poppler tesseract ghostscript pandoc imagemagick ocrmypdf) # tesseract 语言(需要确保 chi_sim 已安装) TESS_LANGS="chi_sim+eng" # 临时/输出目录名称 OUT_DIR_NAME="$(dirname "$DEFAULT_INPUT")" # ------------------------------------ # 传入的 PDF 路径(优先参数) INPUT_PDF="${1:-$DEFAULT_INPUT}" # 如果是相对路径,转换为绝对路径(保留已经是绝对路径的情况) if [[ ! "$INPUT_PDF" = /* ]]; then INPUT_PDF="$PWD/$INPUT_PDF" fi if [[ ! -f "$INPUT_PDF" ]]; then echo "ERROR: 找不到输入 PDF: $INPUT_PDF" echo "请指定正确路径,例如: $0 /absolute/path/to/in.pdf" exit 2 fi WORK_DIR="$(dirname "$INPUT_PDF")" # 统一去掉扩展名(不区分大小写) BASENAME="$(basename "$INPUT_PDF")" BASENAME="${BASENAME%.*}" IMAGES_DIR="$WORK_DIR/images" PRE_DIR="$WORK_DIR/images/pre_all" PRE2_DIR="$WORK_DIR/images/pre2_all" TMP_OCR_ALL="$WORK_DIR/in_images_ocr_best_all.txt" OUT_MD="$WORK_DIR/${BASENAME}.md" OCR_PDF="$WORK_DIR/${BASENAME}_ocr.pdf" SIDECAR="$WORK_DIR/${BASENAME}_sidecar.txt" mkdir -p "$IMAGES_DIR" "$PRE_DIR" "$PRE2_DIR" # Helper: check command command_exists() { command -v "$1" >/dev/null 2>&1 } echo "输入文件: $INPUT_PDF" echo "工作目录: $WORK_DIR" # 1) 安装依赖(Homebrew) if ! 
command_exists brew; then
  echo "Homebrew 未安装。请先安装 Homebrew: https://brew.sh/ ,然后重新运行本脚本。"
  exit 3
fi

# Install any missing Homebrew packages from BREW_PKGS.
echo "检查并安装 Homebrew 包: ${BREW_PKGS[*]}"
for pkg in "${BREW_PKGS[@]}"; do
  if brew list --formula | grep -q "^${pkg}$"; then
    echo "已安装: $pkg"
  else
    echo "安装: $pkg"
    brew install "$pkg"
  fi
done

# Make sure tesseract has the Simplified-Chinese traineddata (chi_sim);
# try to install the Homebrew language pack, otherwise point the user at docs.
# NOTE(fix): use an exact line match (-x). A plain substring match
# (grep -q "chi_sim") also matches e.g. "chi_sim_vert" and would wrongly
# skip installing the language pack.
if ! tesseract --list-langs 2>/dev/null | grep -qx "chi_sim"; then
  echo "tesseract 未安装中文语言包 chi_sim,尝试通过 brew 安装语言包..."
  if brew info tesseract-lang >/dev/null 2>&1; then
    brew install tesseract-lang || echo "请手动安装 tesseract 的 chi_sim 训练数据"
  else
    echo "无法通过 Homebrew 自动安装 tesseract 语言包,请参考 https://tesseract-ocr.github.io/tessdoc/ 如何安装 chi_sim.traineddata"
  fi
fi

# 2) Quick check: does the PDF already carry a text layer?
echo "检查 PDF 是否包含文本层(尝试提取前5页)..."
if command_exists pdftotext; then
  # Extract the first 5 pages (not just the cover page), then strip all
  # whitespace so we test for actual visible characters.
  FIRST_PAGES_TEXT_RAW=$(pdftotext -l 5 -q "$INPUT_PDF" - 2>/dev/null || true)
  # NOTE(fix): printf instead of echo — the extracted text could start with
  # "-" and echo would parse it as an option.
  FIRST_PAGES_TEXT=$(printf '%s' "$FIRST_PAGES_TEXT_RAW" | tr -d '[:space:]' || true)
else
  echo "pdftotext 未找到(poppler)。请确认已安装 poppler。"
  FIRST_PAGES_TEXT=""
fi

# Only treat the PDF as "has a text layer" when those pages contain
# non-whitespace characters.
if [[ -n "$FIRST_PAGES_TEXT" ]]; then
  echo "检测到现有文本层,直接提取文本并转换为 Markdown..."
TXT_OUT="$WORK_DIR/${BASENAME}.txt" pdftotext "$INPUT_PDF" "$TXT_OUT" # 如果 pdftotext 生成的文件仅为空白,也当作无文本层,继续走 OCR # 使用 grep -E 替代 -P (macOS兼容) if grep -Eq '[^[:space:]]' "$TXT_OUT" 2>/dev/null; then # 使用 Python 将纯文本合并为段落并输出为 Markdown(优化朗读连贯性) python3 - <]+>', '', merged) # 清理空格 merged = re.sub(r'\s+', ' ', merged) # 中文字符间不要空格 merged = re.sub(r'([\u4e00-\u9fff])\s+([\u4e00-\u9fff])', r'\1\2', merged) # 中文标点后不要空格 merged = re.sub(r'([,。!?;:、])\s+', r'\1', merged) # 中英文之间保持空格 merged = re.sub(r'([\u4e00-\u9fff])([a-zA-Z0-9])', r'\1 \2', merged) merged = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fff])', r'\1 \2', merged) # 数字和百分号之间不要空格 merged = re.sub(r'(\d)\s+([%%])', r'\1\2', merged) if merged: final_paras.append(merged) text = '\n\n'.join(final_paras) with open(outfile,'w',encoding='utf-8') as f: f.write(text) print('已生成:', outfile) PY exit 0 else echo "pdftotext 提取的文本为空白,继续使用 OCR 流程。" fi fi # 3) 无文本层 -> 使用 OCR echo "未检测到文本层,开始 OCR 流程(ocrmypdf)..." # 生成 OCR 后 PDF(覆盖输出) ocrmypdf --force -l "$TESS_LANGS" "$INPUT_PDF" "$OCR_PDF" --sidecar "$SIDECAR" || { echo "ocrmypdf 失败,请检查 tesseract/ghostscript 是否正确安装。" exit 4 } # 4) 提取图片 echo "提取 PDF 中的图片到 $IMAGES_DIR ..." 
pdfimages -all "$INPUT_PDF" "$IMAGES_DIR/img" || true # 5) 处理所有提取图片并做增强,然后逐张 OCR(psm 3 与 psm 6)选择最佳结果 rm -f "$TMP_OCR_ALL" for img in "$IMAGES_DIR"/*; do [[ -f "$img" ]] || continue ext="${img##*.}" base="$(basename "$img" .${ext})" out1="$PRE_DIR/${base}-pre.png" out2="$PRE2_DIR/${base}-pre2.png" echo "预处理: $img -> $out2" # 初步预处理 - 优化参数提高识别率 magick "$img" -density 300 -resize 200% -colorspace Gray \ -morphology close rectangle:1x1 \ -statistic Median 2x2 \ -deskew 40% \ -contrast-stretch 1%x1% \ -threshold 55% "$out1" # 进一步增强 - 改进锐化和对比度 magick "$out1" -resize 150% \ -contrast-stretch 0.5%x0.5% \ -unsharp 0x1+1+0.05 \ -level 0%,100%,1.2 "$out2" # OCR 两种模式 tesseract "$out2" /tmp/ocr_psm3 -l "$TESS_LANGS" --oem 1 --psm 3 >/dev/null 2>&1 || true tesseract "$out2" /tmp/ocr_psm6 -l "$TESS_LANGS" --oem 1 --psm 6 >/dev/null 2>&1 || true # 选择汉字数量更多的结果 python3 - <> "$TMP_OCR_ALL" import re p3='/tmp/ocr_psm3.txt' p6='/tmp/ocr_psm6.txt' try: s3=open(p3,'r',encoding='utf-8',errors='ignore').read() except: s3='' try: s6=open(p6,'r',encoding='utf-8',errors='ignore').read() except: s6='' han=re.compile(r'[\u4e00-\u9fff]') count3=len(han.findall(s3)) count6=len(han.findall(s6)) chosen = s3 if count3>=count6 else s6 print('===== SOURCE: %s =====' % ("$img")) print(chosen) print('===== END SOURCE: %s =====' % ("$img")) PY done # 6) 清洗合并文本 -> 生成 in_clean.md if [[ ! 
-s "$TMP_OCR_ALL" ]]; then
  if [[ -s "$SIDECAR" ]]; then
    echo "没有从图片生成 OCR 文本,使用 ocrmypdf sidecar: $SIDECAR -> 生成 Markdown"
    # NOTE(fix): pandoc has no "plain" *input* format ("plain" is a writer
    # only), so "-f plain" aborts. Read the plain-text sidecar as markdown.
    pandoc "$SIDECAR" -f markdown -t gfm -o "$OUT_MD" && echo "已生成: $OUT_MD" || echo "pandoc 转换 sidecar 失败"
    exit 0
  else
    echo "ERROR: 未从图片生成 OCR 文本,且未找到 sidecar ($SIDECAR)。无法继续。"
    exit 5
  fi
fi

# Clean the concatenated per-image OCR text and merge it into paragraphs.
# NOTE(review): most of this <<PY here-doc was eaten by the HTML scrape in
# this copy; reconstructed from the intact duplicate further down the page.
python3 - <<PY
import re

infile = r'''$TMP_OCR_ALL'''
outfile = r'''$OUT_MD'''

with open(infile, 'r', encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()

# Step 1: drop noise lines.
cleaned = []
for ln in lines:
    s = ln.strip()
    if not s:
        cleaned.append('')
        continue
    # Skip tool chatter and error lines.
    if any(s.startswith(x) for x in ['processed ', 'Image too small', 'zsh:', '(ocr failed']):
        continue
    if s.startswith('====='):
        cleaned.append('')
        continue
    # Drop very short non-CJK lines (likely noise).
    if len(s) < 2 and not re.search(r'[\u4e00-\u9fff]', s):
        continue
    cleaned.append(s)

# Step 2: merge lines into paragraphs.
end_punc = set('。!?;:.!?;:')
ch_re = re.compile(r'[\u4e00-\u9fff]')
paras = []
cur = ''
for ln in cleaned:
    if ln == '':
        # Blank line: split only when the current paragraph ends a sentence.
        if cur:
            last_char = cur.strip()[-1] if cur.strip() else ''
            if last_char in end_punc:
                paras.append(cur.strip())
                cur = ''
        continue
    if not cur:
        cur = ln
        continue
    last_char = cur[-1] if cur else ''
    first_char = ln[0] if ln else ''
    # Case 1: paragraph ends a full sentence -> maybe start a new one.
    if last_char in end_punc:
        if first_char.isupper() or ch_re.match(first_char):
            # Could be a new paragraph; split only if the current one is
            # already reasonably long, otherwise merge conservatively.
            if len(cur) > 50:
                paras.append(cur.strip())
                cur = ln
                continue
    # Case 2: CJK on both sides -> sentence was wrapped, join directly.
    if ch_re.search(last_char) and ch_re.search(first_char):
        cur = cur + ln
    # Case 3: punctuation then CJK -> join directly.
    elif last_char in end_punc and ch_re.match(first_char):
        cur = cur + ln
    # Case 4: everything else -> join with a space.
    else:
        cur = cur + ' ' + ln
if cur:
    paras.append(cur.strip())

# Step 3: post-process each paragraph.
processed_paras = []
for para in paras:
    # Strip HTML/XML tags (OCR artefacts) and normalise spacing.
    para = re.sub(r'<[^>]+>', '', para)
    para = re.sub(r'\s+', ' ', para)
    # No spaces between CJK characters or after CJK punctuation.
    para = re.sub(r'([\u4e00-\u9fff])\s+([\u4e00-\u9fff])', r'\1\2', para)
    para = re.sub(r'([,。!?;:、])\s+', r'\1', para)
    # No space between Latin punctuation and a following CJK character.
    para = re.sub(r'([.!?;:])\s+([\u4e00-\u9fff])', r'\1\2', para)
    # Keep a single space between CJK and Latin/digits.
    para = re.sub(r'([\u4e00-\u9fff])([a-zA-Z0-9])', r'\1 \2', para)
    para = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fff])', r'\1 \2', para)
    # No space between a digit and a percent sign.
    para = re.sub(r'(\d)\s+([%%])', r'\1\2', para)
    # A single space after a full-width colon before CJK.
    para = re.sub(r':([\u4e00-\u9fff])', r': \1', para)
    if para:
        processed_paras.append(para)

text = '\\n\\n'.join(processed_paras)
with open(outfile, 'w', encoding='utf-8') as f:
    f.write(text)
print('WROTE', outfile)
PY
```

使用方式:

```bash
# 给脚本可执行权限并运行(处理默认文件)
chmod +x ./pdf-to-md.sh
# 或指定任意 PDF
./pdf-to-md.sh path/to/your.pdf
```

## xlsx 转 csv

直接导出为csv时发现如果单元格内容过多会被截断,所以写了一个这样的脚本。

`convert_xlsx_to_csv.py`

```python
#!/usr/bin/env python3
"""
convert_xlsx_to_csv.py

A small CLI to convert Excel (.xlsx) files to CSV text files.
Usage examples: python convert_xlsx_to_csv.py "单三相事件(2)(1).xlsx" python convert_xlsx_to_csv.py -i input.xlsx -o output.csv -s Sheet1 -d "," -e utf-8-sig The script tries to use pandas if available (recommended). If pandas isn't installed, it falls back to openpyxl for basic row-by-row export. """ import argparse import os import sys import csv def parse_args(): p = argparse.ArgumentParser(description="Convert .xlsx to .csv text file") p.add_argument('-i', '--input', required=True, help='Path to input .xlsx file') p.add_argument('-o', '--output', help='Path to output .csv file. If omitted, derived from input.') p.add_argument('-s', '--sheet', help='Sheet name or zero-based index to export. Default: active sheet') p.add_argument('-d', '--delim', default=',', help='CSV delimiter (default: ",")') p.add_argument('-e', '--encoding', default='utf-8-sig', help='Output file encoding (default: utf-8-sig)') return p.parse_args() def write_csv_rows(path, rows, delim, encoding): with open(path, 'w', newline='', encoding=encoding) as f: writer = csv.writer(f, delimiter=delim) for r in rows: # Normalize values: None -> empty string writer.writerow([("" if v is None else v) for v in r]) def export_with_pandas(input_path, output_path, sheet, delim, encoding): import pandas as pd # If sheet is omitted, read the first sheet if sheet is None: df = pd.read_excel(input_path, engine='openpyxl') df.to_csv(output_path, index=False, sep=delim, encoding=encoding) print(f'Wrote: {output_path}') else: # Try to interpret sheet as an int index try: idx = int(sheet) # pandas accepts sheet_name as int df = pd.read_excel(input_path, sheet_name=idx, engine='openpyxl') df.to_csv(output_path, index=False, sep=delim, encoding=encoding) print(f'Wrote sheet index {idx} -> {output_path}') except ValueError: # sheet is a name df = pd.read_excel(input_path, sheet_name=sheet, engine='openpyxl') df.to_csv(output_path, index=False, sep=delim, encoding=encoding) print(f'Wrote sheet "{sheet}" -> {output_path}') 
def export_with_openpyxl(input_path, output_path, sheet, delim, encoding): try: from openpyxl import load_workbook except Exception as e: print('Missing dependency: please install "pandas" or at least "openpyxl".') print('Install with: pip install pandas openpyxl') raise wb = load_workbook(filename=input_path, read_only=True, data_only=True) if sheet is None: ws = wb.active else: # sheet may be index or name if sheet.isdigit(): idx = int(sheet) ws = wb.worksheets[idx] else: if sheet in wb.sheetnames: ws = wb[sheet] else: raise ValueError(f'Sheet "{sheet}" not found in workbook') rows = (tuple(cell if cell is not None else '' for cell in r) for r in ws.iter_rows(values_only=True)) write_csv_rows(output_path, rows, delim, encoding) print(f'Wrote: {output_path}') def main(): args = parse_args() input_path = args.input if not os.path.isfile(input_path): print(f'Input file not found: {input_path}') sys.exit(2) if args.output: output_path = args.output else: base, _ = os.path.splitext(input_path) # If sheet specified and not default, append sheet name if args.sheet: safe_sheet = args.sheet.replace(' ', '_') output_path = f"{base}_{safe_sheet}.csv" else: output_path = f"{base}.csv" # Try pandas first try: import pandas # noqa: F401 use_pandas = True except Exception: use_pandas = False try: if use_pandas: export_with_pandas(input_path, output_path, args.sheet, args.delim, args.encoding) else: export_with_openpyxl(input_path, output_path, args.sheet or None, args.delim, args.encoding) except Exception as e: print('Error during export:', e) sys.exit(1) if __name__ == '__main__': main() ``` `requirements.txt` ``` pandas openpyxl ``` 怎么用? 
``` # (可选)安装依赖 pip install -r requirements.txt # 指定输出文件名与 sheet python convert_xlsx_to_csv.py -i input.xlsx -o output.csv -s Sheet1 # 指定分隔符(例如分号)与编码 python convert_xlsx_to_csv.py -i input.xlsx -o output.csv -d ";" -e utf-8 说明 - 脚本优先尝试使用 `pandas`(功能更强,自动处理表头类型等);如果没有安装 `pandas`,脚本会回退到 `openpyxl` 的逐行导出。 - 默认输出编码为 `utf-8-sig`,方便在 Excel 中直接双击打开时正确识别中文。 ``` Loading... ## word转markdown word没法直接给AI,转MD后就可以处理,把协议转成数据库,直接把两天的工作量缩减到2分钟 ```bash pandoc document.docx -f docx -t markdown --wrap=none -s -o document.md ``` ## pdf转word mac ```bash brew update brew install ocrmypdf tesseract tesseract-lang qpdf ghostscript # 查看已装语言 tesseract --list-langs #对单个 PDF 执行 OCR(含中文简体) ocrmypdf --language chi_sim+eng ./in.pdf ./out_ocr.pdf --sidecar ./out.txt ``` ## pdf转md mac脚本内容如下: ```bash #!/usr/bin/env zsh # pdf-to-md.sh # 一键将 PDF 转为 Markdown 的脚本(包含依赖检查/安装、OCR、图片预处理、合并与清洗) # 目标输出:in_clean.md(与输入 PDF 同目录) # 用法: # ./pdf-to-md.sh /absolute/path/to/in.pdf # 如果不传入参数,脚本会尝试处理 ~/in.pdf set -euo pipefail IFS=$'\n\t' # --------- 配置(可修改) --------- DEFAULT_INPUT="$HOME/in.pdf" # 安装软件列表(Homebrew) BREW_PKGS=(poppler tesseract ghostscript pandoc imagemagick ocrmypdf) # tesseract 语言(需要确保 chi_sim 已安装) TESS_LANGS="chi_sim+eng" # 临时/输出目录名称 OUT_DIR_NAME="$(dirname "$DEFAULT_INPUT")" # ------------------------------------ # 传入的 PDF 路径(优先参数) INPUT_PDF="${1:-$DEFAULT_INPUT}" # 如果是相对路径,转换为绝对路径(保留已经是绝对路径的情况) if [[ ! "$INPUT_PDF" = /* ]]; then INPUT_PDF="$PWD/$INPUT_PDF" fi if [[ ! 
-f "$INPUT_PDF" ]]; then echo "ERROR: 找不到输入 PDF: $INPUT_PDF" echo "请指定正确路径,例如: $0 /absolute/path/to/in.pdf" exit 2 fi WORK_DIR="$(dirname "$INPUT_PDF")" # 统一去掉扩展名(不区分大小写) BASENAME="$(basename "$INPUT_PDF")" BASENAME="${BASENAME%.*}" IMAGES_DIR="$WORK_DIR/images" PRE_DIR="$WORK_DIR/images/pre_all" PRE2_DIR="$WORK_DIR/images/pre2_all" TMP_OCR_ALL="$WORK_DIR/in_images_ocr_best_all.txt" OUT_MD="$WORK_DIR/${BASENAME}.md" OCR_PDF="$WORK_DIR/${BASENAME}_ocr.pdf" SIDECAR="$WORK_DIR/${BASENAME}_sidecar.txt" mkdir -p "$IMAGES_DIR" "$PRE_DIR" "$PRE2_DIR" # Helper: check command command_exists() { command -v "$1" >/dev/null 2>&1 } echo "输入文件: $INPUT_PDF" echo "工作目录: $WORK_DIR" # 1) 安装依赖(Homebrew) if ! command_exists brew; then echo "Homebrew 未安装。请先安装 Homebrew: https://brew.sh/ ,然后重新运行本脚本。" exit 3 fi echo "检查并安装 Homebrew 包: ${BREW_PKGS[*]}" for pkg in "${BREW_PKGS[@]}"; do if brew list --formula | grep -q "^${pkg}$"; then echo "已安装: $pkg" else echo "安装: $pkg" brew install "$pkg" fi done # 确保 tesseract 有中文语言包(chi_sim),如果没有尝试安装 tesseract-lang 或提示用户 if ! tesseract --list-langs 2>/dev/null | grep -q "chi_sim"; then echo "tesseract 未安装中文语言包 chi_sim,尝试通过 brew 安装语言包..." if brew info tesseract-lang >/dev/null 2>&1; then brew install tesseract-lang || echo "请手动安装 tesseract 的 chi_sim 训练数据" else echo "无法通过 Homebrew 自动安装 tesseract 语言包,请参考 https://tesseract-ocr.github.io/tessdoc/ 如何安装 chi_sim.traineddata" fi fi # 2) 快速判断 PDF 是否已有文本层 echo "检查 PDF 是否包含文本层(尝试提取前5页)..." if command_exists pdftotext; then # 读取前5页文本(避免只检查封面页),然后去除所有空白检测实际字符 FIRST_PAGES_TEXT_RAW=$(pdftotext -l 5 -q "$INPUT_PDF" - 2>/dev/null || true) FIRST_PAGES_TEXT=$(echo "$FIRST_PAGES_TEXT_RAW" | tr -d '[:space:]' || true) else echo "pdftotext 未找到(poppler)。请确认已安装 poppler。" FIRST_PAGES_TEXT="" fi # 仅当前几页提取的内容包含非空白字符时,认为 PDF 有文本层 if [[ -n "$FIRST_PAGES_TEXT" ]]; then echo "检测到现有文本层,直接提取文本并转换为 Markdown..." 
TXT_OUT="$WORK_DIR/${BASENAME}.txt" pdftotext "$INPUT_PDF" "$TXT_OUT" # 如果 pdftotext 生成的文件仅为空白,也当作无文本层,继续走 OCR # 使用 grep -E 替代 -P (macOS兼容) if grep -Eq '[^[:space:]]' "$TXT_OUT" 2>/dev/null; then # 使用 Python 将纯文本合并为段落并输出为 Markdown(优化朗读连贯性) python3 - <<PY import re, sys infile = r"$TXT_OUT" outfile = r"$OUT_MD" try: with open(infile,'r',encoding='utf-8',errors='ignore') as f: lines = [ln.rstrip() for ln in f] except Exception as e: print('读取文本失败:', e, file=sys.stderr) sys.exit(1) # 智能段落合并:识别真正的段落边界,移除句子内部的不必要换行 paras = [] cur_lines = [] end_punc = set('。!?;:.!?;:') for l in lines: stripped = l.strip() if not stripped: # 空行:判断是否真的是段落分隔 if cur_lines: # 检查最后一行是否以句号结尾 last_line = cur_lines[-1].strip() if last_line and last_line[-1] in end_punc: # 是完整段落,保存 paras.append(cur_lines) cur_lines = [] # 否则可能是段落内的空行,忽略 continue cur_lines.append(stripped) # 处理最后一个段落 if cur_lines: paras.append(cur_lines) # 合并每个段落内的行,智能处理中英文 final_paras = [] for para_lines in paras: if not para_lines: continue # 逐行合并,判断是否需要空格 merged = '' for line in para_lines: if not merged: merged = line continue # 检查前一行最后一个字符和当前行第一个字符 last_char = merged[-1] if merged else '' first_char = line[0] if line else '' # 如果都是中文字符,直接连接(句子被截断的情况) if re.match(r'[\u4e00-\u9fff]', last_char) and re.match(r'[\u4e00-\u9fff]', first_char): merged += line # 如果前一行以标点结尾,当前行以中文开头,直接连接 elif last_char in end_punc and re.match(r'[\u4e00-\u9fff]', first_char): merged += line else: # 其他情况用空格连接 merged += ' ' + line # 移除HTML/XML标签 merged = re.sub(r'<[^>]+>', '', merged) # 清理空格 merged = re.sub(r'\s+', ' ', merged) # 中文字符间不要空格 merged = re.sub(r'([\u4e00-\u9fff])\s+([\u4e00-\u9fff])', r'\1\2', merged) # 中文标点后不要空格 merged = re.sub(r'([,。!?;:、])\s+', r'\1', merged) # 中英文之间保持空格 merged = re.sub(r'([\u4e00-\u9fff])([a-zA-Z0-9])', r'\1 \2', merged) merged = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fff])', r'\1 \2', merged) # 数字和百分号之间不要空格 merged = re.sub(r'(\d)\s+([%%])', r'\1\2', merged) if merged: final_paras.append(merged) text = 
'\n\n'.join(final_paras)
with open(outfile, 'w', encoding='utf-8') as f:
    f.write(text)
print('已生成:', outfile)
PY
  exit 0
else
  echo "pdftotext 提取的文本为空白,继续使用 OCR 流程。"
fi
fi

# 3) No text layer -> OCR the whole PDF with ocrmypdf.
echo "未检测到文本层,开始 OCR 流程(ocrmypdf)..."
# NOTE(fix): ocrmypdf has no "--force" option; the flag that forces OCR on
# every page (rasterising any existing text) is "--force-ocr".
ocrmypdf --force-ocr -l "$TESS_LANGS" "$INPUT_PDF" "$OCR_PDF" --sidecar "$SIDECAR" || {
  echo "ocrmypdf 失败,请检查 tesseract/ghostscript 是否正确安装。"
  exit 4
}

# 4) Extract embedded images for the per-image OCR pass.
echo "提取 PDF 中的图片到 $IMAGES_DIR ..."
pdfimages -all "$INPUT_PDF" "$IMAGES_DIR/img" || true

# 5) Preprocess every extracted image, OCR it with two page-segmentation
# modes (psm 3 and psm 6) and keep whichever result contains more Han
# characters.
rm -f "$TMP_OCR_ALL"
for img in "$IMAGES_DIR"/*; do
  [[ -f "$img" ]] || continue
  ext="${img##*.}"
  base="$(basename "$img" ".${ext}")"
  out1="$PRE_DIR/${base}-pre.png"
  out2="$PRE2_DIR/${base}-pre2.png"
  echo "预处理: $img -> $out2"
  # First pass: upscale, grey-scale, despeckle, deskew and binarise.
  magick "$img" -density 300 -resize 200% -colorspace Gray \
    -morphology close rectangle:1x1 \
    -statistic Median 2x2 \
    -deskew 40% \
    -contrast-stretch 1%x1% \
    -threshold 55% "$out1"
  # Second pass: further upscale, sharpen and lift contrast.
  magick "$out1" -resize 150% \
    -contrast-stretch 0.5%x0.5% \
    -unsharp 0x1+1+0.05 \
    -level 0%,100%,1.2 "$out2"
  # OCR with both segmentation modes.
  # NOTE(review): /tmp/ocr_psm?.txt are fixed, predictable temp names —
  # acceptable for a single-user laptop script, but mktemp would be safer.
  tesseract "$out2" /tmp/ocr_psm3 -l "$TESS_LANGS" --oem 1 --psm 3 >/dev/null 2>&1 || true
  tesseract "$out2" /tmp/ocr_psm6 -l "$TESS_LANGS" --oem 1 --psm 6 >/dev/null 2>&1 || true
  # Keep the result with more Han characters.
  python3 - <<PY >> "$TMP_OCR_ALL"
import re

p3 = '/tmp/ocr_psm3.txt'
p6 = '/tmp/ocr_psm6.txt'
try:
    s3 = open(p3, 'r', encoding='utf-8', errors='ignore').read()
except Exception:
    s3 = ''
try:
    s6 = open(p6, 'r', encoding='utf-8', errors='ignore').read()
except Exception:
    s6 = ''
han = re.compile(r'[\u4e00-\u9fff]')
count3 = len(han.findall(s3))
count6 = len(han.findall(s6))
chosen = s3 if count3 >= count6 else s6
print('===== SOURCE: %s =====' % ("$img"))
print(chosen)
print('===== END SOURCE: %s =====' % ("$img"))
PY
done

# 6) Clean and merge the OCR text -> Markdown.
if [[ !
-s "$TMP_OCR_ALL" ]]; then
  if [[ -s "$SIDECAR" ]]; then
    echo "没有从图片生成 OCR 文本,使用 ocrmypdf sidecar: $SIDECAR -> 生成 Markdown"
    # NOTE(fix): pandoc has no "plain" *input* format ("plain" is a writer
    # only), so "-f plain" aborts. Read the plain-text sidecar as markdown.
    pandoc "$SIDECAR" -f markdown -t gfm -o "$OUT_MD" && echo "已生成: $OUT_MD" || echo "pandoc 转换 sidecar 失败"
    exit 0
  else
    echo "ERROR: 未从图片生成 OCR 文本,且未找到 sidecar ($SIDECAR)。无法继续。"
    exit 5
  fi
fi

# Clean the concatenated per-image OCR text and merge it into paragraphs.
python3 - <<PY
import re

infile = r'''$TMP_OCR_ALL'''
outfile = r'''$OUT_MD'''

with open(infile, 'r', encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()

# Step 1: drop noise lines.
cleaned = []
for ln in lines:
    s = ln.strip()
    if not s:
        cleaned.append('')
        continue
    # Skip tool chatter and error lines.
    if any(s.startswith(x) for x in ['processed ', 'Image too small', 'zsh:', '(ocr failed']):
        continue
    if s.startswith('====='):
        cleaned.append('')
        continue
    # Drop very short non-CJK lines (likely noise).
    if len(s) < 2 and not re.search(r'[\u4e00-\u9fff]', s):
        continue
    cleaned.append(s)

# Step 2: merge lines into paragraphs.
end_punc = set('。!?;:.!?;:')
ch_re = re.compile(r'[\u4e00-\u9fff]')
paras = []
cur = ''
for ln in cleaned:
    if ln == '':
        # Blank line: split only when the current paragraph ends a sentence.
        if cur:
            last_char = cur.strip()[-1] if cur.strip() else ''
            if last_char in end_punc:
                paras.append(cur.strip())
                cur = ''
        continue
    if not cur:
        cur = ln
        continue
    last_char = cur[-1] if cur else ''
    first_char = ln[0] if ln else ''
    # Case 1: paragraph ends a full sentence -> maybe start a new one.
    if last_char in end_punc:
        if first_char.isupper() or ch_re.match(first_char):
            # Could be a new paragraph; split only if the current one is
            # already reasonably long, otherwise merge conservatively.
            if len(cur) > 50:
                paras.append(cur.strip())
                cur = ln
                continue
    # Case 2: CJK on both sides -> sentence was wrapped, join directly.
    if ch_re.search(last_char) and ch_re.search(first_char):
        cur = cur + ln
    # Case 3: punctuation then CJK -> join directly.
    elif last_char in end_punc and ch_re.match(first_char):
        cur = cur + ln
    # Case 4: everything else -> join with a space.
    else:
        cur = cur + ' ' + ln
if cur:
    paras.append(cur.strip())

# Step 3: post-process each paragraph.
processed_paras = []
for para in paras:
    # Strip HTML/XML tags (OCR artefacts) and normalise spacing.
    para = re.sub(r'<[^>]+>', '', para)
    para = re.sub(r'\s+', ' ', para)
    # No spaces between CJK characters (wrapped sentences).
    para = re.sub(r'([\u4e00-\u9fff])\s+([\u4e00-\u9fff])', r'\1\2',
para) # 中文标点后不需要空格 para = re.sub(r'([,。!?;:、])\s+', r'\1', para) # 英文标点和中文之间的空格 para = re.sub(r'([.!?;:])\s+([\u4e00-\u9fff])', r'\1\2', para) # 中英文之间保持单个空格 para = re.sub(r'([\u4e00-\u9fff])([a-zA-Z0-9])', r'\1 \2', para) para = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fff])', r'\1 \2', para) # 数字和单位之间处理 para = re.sub(r'(\d)\s+([%%])', r'\1\2', para) # 冒号后中文之间的处理 para = re.sub(r':([\u4e00-\u9fff])', r': \1', para) if para: processed_paras.append(para) text='\\n\\n'.join(processed_paras) with open(outfile,'w',encoding='utf-8') as f: f.write(text) print('WROTE', outfile) PY ``` 使用方式: ```bash # 给脚本可执行权限并运行(处理默认文件) chmod +x ./pdf-to-md.sh # 或指定任意 PDF ./pdf-to-md.sh path/to/your.pdf ``` ## xlsx 转 csv 直接导出为csv时发现如果单元格内容过多会被截断,所以写了一个这样的脚本。 `convert_xlsx_to_csv.py` ```python #!/usr/bin/env python3 """ convert_xlsx_to_csv.py A small CLI to convert Excel (.xlsx) files to CSV text files. Usage examples: python convert_xlsx_to_csv.py "单三相事件(2)(1).xlsx" python convert_xlsx_to_csv.py -i input.xlsx -o output.csv -s Sheet1 -d "," -e utf-8-sig The script tries to use pandas if available (recommended). If pandas isn't installed, it falls back to openpyxl for basic row-by-row export. """ import argparse import os import sys import csv def parse_args(): p = argparse.ArgumentParser(description="Convert .xlsx to .csv text file") p.add_argument('-i', '--input', required=True, help='Path to input .xlsx file') p.add_argument('-o', '--output', help='Path to output .csv file. If omitted, derived from input.') p.add_argument('-s', '--sheet', help='Sheet name or zero-based index to export. 
Default: active sheet') p.add_argument('-d', '--delim', default=',', help='CSV delimiter (default: ",")') p.add_argument('-e', '--encoding', default='utf-8-sig', help='Output file encoding (default: utf-8-sig)') return p.parse_args() def write_csv_rows(path, rows, delim, encoding): with open(path, 'w', newline='', encoding=encoding) as f: writer = csv.writer(f, delimiter=delim) for r in rows: # Normalize values: None -> empty string writer.writerow([("" if v is None else v) for v in r]) def export_with_pandas(input_path, output_path, sheet, delim, encoding): import pandas as pd # If sheet is omitted, read the first sheet if sheet is None: df = pd.read_excel(input_path, engine='openpyxl') df.to_csv(output_path, index=False, sep=delim, encoding=encoding) print(f'Wrote: {output_path}') else: # Try to interpret sheet as an int index try: idx = int(sheet) # pandas accepts sheet_name as int df = pd.read_excel(input_path, sheet_name=idx, engine='openpyxl') df.to_csv(output_path, index=False, sep=delim, encoding=encoding) print(f'Wrote sheet index {idx} -> {output_path}') except ValueError: # sheet is a name df = pd.read_excel(input_path, sheet_name=sheet, engine='openpyxl') df.to_csv(output_path, index=False, sep=delim, encoding=encoding) print(f'Wrote sheet "{sheet}" -> {output_path}') def export_with_openpyxl(input_path, output_path, sheet, delim, encoding): try: from openpyxl import load_workbook except Exception as e: print('Missing dependency: please install "pandas" or at least "openpyxl".') print('Install with: pip install pandas openpyxl') raise wb = load_workbook(filename=input_path, read_only=True, data_only=True) if sheet is None: ws = wb.active else: # sheet may be index or name if sheet.isdigit(): idx = int(sheet) ws = wb.worksheets[idx] else: if sheet in wb.sheetnames: ws = wb[sheet] else: raise ValueError(f'Sheet "{sheet}" not found in workbook') rows = (tuple(cell if cell is not None else '' for cell in r) for r in ws.iter_rows(values_only=True)) 
write_csv_rows(output_path, rows, delim, encoding) print(f'Wrote: {output_path}') def main(): args = parse_args() input_path = args.input if not os.path.isfile(input_path): print(f'Input file not found: {input_path}') sys.exit(2) if args.output: output_path = args.output else: base, _ = os.path.splitext(input_path) # If sheet specified and not default, append sheet name if args.sheet: safe_sheet = args.sheet.replace(' ', '_') output_path = f"{base}_{safe_sheet}.csv" else: output_path = f"{base}.csv" # Try pandas first try: import pandas # noqa: F401 use_pandas = True except Exception: use_pandas = False try: if use_pandas: export_with_pandas(input_path, output_path, args.sheet, args.delim, args.encoding) else: export_with_openpyxl(input_path, output_path, args.sheet or None, args.delim, args.encoding) except Exception as e: print('Error during export:', e) sys.exit(1) if __name__ == '__main__': main() ``` `requirements.txt` ``` pandas openpyxl ``` 怎么用? ``` # (可选)安装依赖 pip install -r requirements.txt # 指定输出文件名与 sheet python convert_xlsx_to_csv.py -i input.xlsx -o output.csv -s Sheet1 # 指定分隔符(例如分号)与编码 python convert_xlsx_to_csv.py -i input.xlsx -o output.csv -d ";" -e utf-8 说明 - 脚本优先尝试使用 `pandas`(功能更强,自动处理表头类型等);如果没有安装 `pandas`,脚本会回退到 `openpyxl` 的逐行导出。 - 默认输出编码为 `utf-8-sig`,方便在 Excel 中直接双击打开时正确识别中文。 ``` 最后修改:2026 年 01 月 12 日 © 允许规范转载 赞 1 别打赏,我怕忍不住购买辣条与续命水