#Project#基于python的PDF文本翻译实作
共 10391字,需浏览 21分钟
·
2024-05-15 22:30
“ 文章所涉及内容更多来自网络,在此声明,并感谢知识的贡献者!”
项目实现目标
—
项目实现目标:
1 实现PDF文件转成TXT文件
2 实现识别图片中的文字,并输出TXT文件
3 基于有道词典进行查词
项目所在环境
—
项目所在环境:
Operating System: CentOS 7
Python Version: python 3.6.1
项目环境初始
—
项目环境初始:
Centos下的安装指令:
yum update
yum groupinstall "Development Tools"
yum -y install automake autoconf libtool zlib-devel libjpeg-devel giflib libtiff-devel libwebp libwebp-devel libicu-devel openjpeg-devel cairo-devel
yum install gcc
pip3 install wand
pip3 install pytesseract
pip3 install pillow
pip3 install tesseract
wget https://github.com/tesseract-ocr/tesseract/archive/3.04.01.tar.gz
mv 3.04.01.tar.gz tesseract-3.04.01.tar.gz
tar xzvf tesseract-3.04.01.tar.gz
cd tesseract-3.04.01/
./autogen.sh
./configure
make
make install
ldconfig
pip3 install pyocr
yum install python-imaging
yum install ImageMagick-devel
export TESSDATA_PREFIX=/usr/local/share/tessdata
pip3 install pdfminer.six
pip3 install urllib  # 注:urllib 属于 Python 3 标准库,无需(也无法)通过 pip 安装,此步可跳过
项目文件架构
—
项目文件架构:
项目Python源码
—
项目Python源码:
# -*- encoding: utf-8 -*-
import os
import io
import json
from wand.image import Image
from PIL import Image as PI
from PIL import ImageEnhance
import pyocr
import pyocr.builders
import re
import pytesseract as ocr
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.layout import *
import urllib.request
import urllib.parse
#获取指定文件夹下的所有文件名
def get_files(file_folder):
    """Return the paths of all entries in a folder under the working directory.

    Args:
        file_folder: Folder name, resolved relative to ``os.getcwd()``.

    Returns:
        A list of path strings, one per directory entry (files and
        sub-directories alike), sorted by entry name.

    Raises:
        OSError: If the folder does not exist or cannot be listed.
    """
    folder_path = os.path.join(os.getcwd(), file_folder)
    # os.listdir() yields entries in arbitrary order; sorting makes positional
    # access such as files[0] / files[1] reproducible between runs.
    return [os.path.join(folder_path, name)
            for name in sorted(os.listdir(folder_path))]
#获取图片式PDF文件的内容
def pdf_image_to_string(pdf_file, lang_code=0):
    """Run OCR over every page of an image-based (scanned) PDF.

    Args:
        pdf_file: Path of the PDF file to rasterize.
        lang_code: Index into the OCR tool's list of available languages.

    Returns:
        A list holding the recognized text of each rendered page, in page
        order.

    Raises:
        IndexError: If pyocr reports no available OCR tool, or ``lang_code``
            is outside the list of available languages.
    """
    # pyocr can wrap several OCR engines; the first available one is used.
    tool = pyocr.get_available_tools()[0]
    # tool.get_available_languages() lists the selectable languages.
    lang = tool.get_available_languages()[lang_code]

    # Render the PDF at 300 dpi and keep each page as a JPEG blob. The Wand
    # images wrap native ImageMagick resources, so they are released
    # deterministically through context managers.
    page_blobs = []
    with Image(filename=pdf_file, resolution=300) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            for img in image_jpeg.sequence:
                with Image(image=img) as img_page:
                    page_blobs.append(img_page.make_blob('jpeg'))

    # OCR every rendered page.
    final_text = []
    for blob in page_blobs:
        with PI.open(io.BytesIO(blob)) as page_image:
            txt = tool.image_to_string(page_image, lang=lang,
                                       builder=pyocr.builders.TextBuilder())
        final_text.append(txt)
    return final_text
#获取可读PDF的文本内容
def pdf_text_to_string(pdf_file):
    """Extract the text of a PDF that has a real text layer.

    Only horizontal text boxes are collected. All whitespace (including
    spaces and line breaks inside a box) is stripped from each box, and boxes
    that end up empty are skipped.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        A list of strings, one per non-empty horizontal text box, in the
        order the layout analysis yields them.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
        OSError: If the file cannot be opened.
    """
    final_text = []
    whitespace = re.compile(r'\s+')
    # Pages are parsed lazily from the open file, so all processing must stay
    # inside the ``with`` block; the handle is closed on exit even on error.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        # A password could be passed here: PDFDocument(parser, password)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager for shared resources; caching is disabled.
        rsrcmgr = PDFResourceManager(caching=False)
        laparams = LAParams()
        # The aggregator device collects the layout objects of each page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The LTPage result contains the objects parsed from this page
            # (text boxes, figures, images, ...).
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    text = whitespace.sub('', x.get_text())
                    if text:
                        final_text.append(text)
    return final_text
#使用pyocr 提取图片中的文字
def pyocr_image_to_string(img_file, lang_code=0):
    """Recognize the text in an image file using pyocr.

    Args:
        img_file: Path (or file object) of the image to read.
        lang_code: Index into the OCR tool's list of available languages.

    Returns:
        The text produced by the OCR tool's ``TextBuilder``.

    Raises:
        IndexError: If pyocr reports no available OCR tool, or ``lang_code``
            is outside the list of available languages.
    """
    tool = pyocr.get_available_tools()[0]
    lang = tool.get_available_languages()[lang_code]
    # PIL keeps the underlying file open until the image is closed, so the
    # image is used as a context manager to release the handle promptly.
    with PI.open(img_file) as image:
        text = tool.image_to_string(image, lang=lang,
                                    builder=pyocr.builders.TextBuilder())
    return text
#使用pytesseract 提取图片中的文字
def pytess_imgae_to_string(img_file):
    """Recognize the text in an image file using pytesseract.

    The function name keeps its original spelling ("imgae") because other
    code calls it by that name.

    Args:
        img_file: Path (or file object) of the image to read.

    Returns:
        The text recognized with the ``chi_sim`` language data.
    """
    # Close the image (and its file handle) as soon as OCR has finished.
    with PI.open(img_file) as img:
        text = ocr.image_to_string(img, lang='chi_sim')
    return text
#使用ImageEnhance增强提取图片中的文字
def enhance_image_to_string(img_file):
    """Recognize the text in an image after boosting its contrast.

    Args:
        img_file: Path (or file object) of the image to read.

    Returns:
        The text pytesseract recognizes, using the ``chi_sim`` language data,
        in the contrast-enhanced image.
    """
    # Keep the source image open while it is enhanced and recognized, then
    # release its file handle.
    with PI.open(img_file) as image:
        # A higher contrast (factor 4) is intended to improve recognition.
        enhancer = ImageEnhance.Contrast(image)
        image_enhancer = enhancer.enhance(4)
        text = ocr.image_to_string(image_enhancer, lang='chi_sim')
    return text
#模拟浏览器使用有道进行翻译
def youdao_html_translate(text,url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&sessionFrom="):
    """Translate text by posting a browser-like form to the Youdao endpoint.

    Args:
        text: The text to translate; sent as form field ``i``.
        url: Endpoint that receives the form and answers with JSON.

    Returns:
        The ``tgt`` field of the first entry of ``translateResult`` in the
        JSON reply.

    Raises:
        urllib.error.URLError: If the request fails.
        KeyError: If the reply carries no ``translateResult`` entry.
    """
    # Form fields mimic what the web page submits.
    # NOTE(review): 'salt' and 'sign' are fixed values copied as-is; whether
    # the service still accepts them should be confirmed.
    data = {
        'i': text,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '1503581407033',
        'sign': '67472a1b3638989677f7aca9af3be0aa',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CLICKBUTTION',
        'typoResult': 'true',
    }
    data = urllib.parse.urlencode(data).encode('utf-8')

    # The proxy mapping only covers the "https" scheme.
    proxy_support = urllib.request.ProxyHandler({"https": "222.161.16.10:9999"})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36')]
    # Kept for backward compatibility: this installs the opener process-wide,
    # so later urllib.request.urlopen() calls use it as well.
    urllib.request.install_opener(opener)

    req = urllib.request.Request(url, data)
    # Close the response deterministically instead of leaking the connection.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')
    tar = json.loads(html)
    return tar['translateResult'][0][0]['tgt']
#写入txt文本
def string_to_text(file,text):
    """Write text to a file, replacing any existing content.

    Args:
        file: Path of the file to write.
        text: The string to store.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # ``with`` closes the file on every path and lets a failing open() raise
    # its own error. UTF-8 is set explicitly so non-ASCII text (e.g. Chinese)
    # can be written regardless of the system locale.
    with open(file, "w", encoding="utf-8") as f:
        f.write(text)
def translator(file_folder,file_type):
    """Extract text from a folder's files, translate it and save the result.

    For ``'PDF'`` the first file in the folder is processed as an image-based
    PDF and the second as a text PDF. For ``'IMG'`` the first file is read
    with each of the three image OCR helpers. Any other ``file_type`` falls
    back to a built-in test phrase. Only the first extracted text is
    translated; the translation is written to ``txt_files/test.txt`` under
    the working directory and printed.

    Args:
        file_folder: Folder (relative to the working directory) to scan.
        file_type: ``'PDF'``, ``'IMG'``, or anything else for the test phrase.

    Raises:
        IndexError: If the folder holds fewer files than the mode requires.
    """
    text = []
    if file_type == 'PDF':
        pdf_files = get_files(file_folder)
        # The PDF helpers return lists of fragments; join each into a single
        # string so that text[0] is translatable text rather than a list.
        text.append('\n'.join(pdf_image_to_string(pdf_files[0], lang_code=0)))
        text.append('\n'.join(pdf_text_to_string(pdf_files[1])))
    elif file_type == 'IMG':
        img_files = get_files(file_folder)
        text.append(pyocr_image_to_string(img_files[0]))
        text.append(pytess_imgae_to_string(img_files[0]))
        text.append(enhance_image_to_string(img_files[0]))
    if not text:
        text.append('翻译测试')
    translated_text = youdao_html_translate(text=text[0])
    # Make sure the output folder exists before writing into it.
    os.makedirs(os.getcwd() + '/txt_files', exist_ok=True)
    string_to_text(file= os.getcwd() + '/txt_files/test.txt', text= translated_text)
    print(translated_text)
def main():
    """Run the demo: translate the built-in test phrase.

    ``file_type='TEST'`` matches neither the PDF nor the IMG branch of
    ``translator``, so its fallback phrase is translated.
    """
    translator(file_folder='empty_files',file_type='TEST')


# Without this guard main() is defined but never invoked when the file is
# executed as a script.
if __name__ == '__main__':
    main()
项目参考资料
—
项目参考资料:
http://www.wisedream.net/2016/07/18/imgProcessing/ocr-with-pytesseract/
https://ivanzz1001.github.io/records/post/ocr/2017/09/08/tesseract-install
http://blog.topspeedsnail.com/archives/3571
http://blog.csdn.net/fighting_no1/article/details/51038942
http://blog.csdn.net/qq_21905401/article/details/77620561