• 技术文章 >后端开发 >Python教程

    怎么通过Python实现批量数据提取

    王林王林2023-04-29 21:16:05转载59

    配置需求

    1.ImageMagick

    2.tesseract-OCR

    3.Python3.7

    4.from PIL import Image as PI

    5.import io

    6.import os

    7.import pyocr.builders

    8.from cnocr import CnOcr

    9.import xlwt

    怎么通过Python实现批量数据提取

    分析上图发现票据金额为“贰拾万元整”,数据金额为大写中文,因此在导入Excel之前我们需要将金额票据的数据转换成数字的格式,基于此,我们需要首先完成大写汉字和数字的转换。

    def chineseNumber2Int(strNum: str):
        result = 0
        temp = 1  # 存放一个单位的数字如:十万
        count = 0  # 判断是否有chArr
        cnArr = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖']
        chArr = ['拾', '佰', '仟', '万', '亿']
        for i in range(len(strNum)):
            b = True
            c = strNum[i]
            for j in range(len(cnArr)):
                if c == cnArr[j]:
                    if count != 0:
                        result += temp
                        count = 0
                    temp = j + 1
                    b = False
                    break
            if b:
                for j in range(len(chArr)):
                    if c == chArr[j]:
                        if j == 0:
                            temp *= 10
                        elif j == 1:
                            temp *= 100
                        elif j == 2:
                            temp *= 1000
                        elif j == 3:
                            temp *= 10000
                        elif j == 4:
                            temp *= 100000000
                    count += 1
            if i == len(strNum) - 1:
                result += temp
        return result

    通过上述代码即可实现大写字母与数字的转换,例如输入“贰拾万元整”即可导出“200000”,再将其转换成数字后即可极大地简化表格的操作,也可以在完成表格操作的同时有利于数据归档。

    接下来,我们需要分析发票的内部内容,分析下图可知,我们需要获取以下几个数据内容:“出票日期”、“汇票到账日期”、“票据号码”、“收款人”、“票据金额”、“出票人”,可以通过画图软件获取精准定位。

    怎么通过Python实现批量数据提取

    如图,小黑点即鼠标所在地,画图软件左下角即他的坐标。

    怎么通过Python实现批量数据提取

    提取出票日期

    def text1(new_img):
        #提取出票日期
        left = 80
        top = 143
        right = 162
        bottom = 162
        image_text1 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text1.show()
        txt1 = tool.image_to_string(image_text1)
        print(txt1)
        return str(txt1)

    提取金额

    def text2(new_img):
        #提取金额
        left = 224
        top = 355
        right = 585
        bottom = 380
        image_text2 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text2.show()
        image_text2.save("img/tmp.png")
        temp = ocr.ocr("img/tmp.png")
        temp="".join(temp[0])
        txt2=chineseNumber2Int(temp)
        print(txt2)
        return txt2

    提取出票人

    def text3(new_img):
        #提取出票人
        left = 177
        top = 207
        right = 506
        bottom = 231
        image_text3 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text3.show()
        image_text3.save("img/tmp.png")
        temp = ocr.ocr("img/tmp.png")
        txt3="".join(temp[0])
        print(txt3)
        return txt3

    提取付款行

    def text4(new_img):
        #提取付款行
        left = 177
        top = 274
        right = 492
        bottom = 311
        image_text4 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text4.show()
        image_text4.save("img/tmp.png")
        temp = ocr.ocr("img/tmp.png")
        txt4="".join(temp[0])
        print(txt4)
        return txt4

    提取汇票到账日期

    def text5(new_img):
        #提取汇票到日期
        left = 92
        top = 166
        right = 176
        bottom = 184
        image_text5 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text5.show()
        txt5 = tool.image_to_string(image_text5)
        print(txt5)
        return txt5

    提取票据单据

    def text6(new_img):
        #提取票据号码
        left = 598
        top = 166
        right = 870
        bottom = 182
        image_text6 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text6.show()
        txt6 = tool.image_to_string(image_text6)
        print(txt6)
        return txt6

    在将数据全部提取完成之后,即进入设置环节,我们需要首先将所有账单文件进行提取,获取他们的文件名和路径。

    ocr=CnOcr()
    tool = pyocr.get_available_tools()[0]
    filePath='img'
    img_name=[]
    for i,j,name in os.walk(filePath):
        img_name=name

    在获取完整后,即可进行数据导入Excel的操作。

    count=1
    book = xlwt.Workbook(encoding='utf-8',style_compression=0)
    sheet = book.add_sheet('test',cell_overwrite_ok=True)
    for i in img_name:
        img_url = filePath+"/"+i
        with open(img_url, 'rb') as f:
            a = f.read()
        new_img = PI.open(io.BytesIO(a))
        ## 写入csv
        col = ('年份','出票日期','金额','出票人','付款行全称','汇票到日期','备注')
        for j in range(0,7):
            sheet.write(0,j,col[j])
        book.save('1.csv')
        shijian=text1(new_img)
        sheet.write(count,0,shijian[0:4])
        sheet.write(count,1,shijian[5:])
        sheet.write(count,2,text2(new_img))
        sheet.write(count,3,text3(new_img))
        sheet.write(count,4,text4(new_img))
        sheet.write(count,5,text5(new_img))
        sheet.write(count,6,text6(new_img))
        count = count + 1

    至此,完整流程结束。

    附上源码全部

    from  wand.image import  Image
    from PIL import Image as PI
    import pyocr
    import io
    import re
    import os
    import shutil
    import pyocr.builders
    from cnocr import CnOcr
    import requests
    import xlrd
    import xlwt
    from openpyxl import load_workbook
     
    def chineseNumber2Int(strNum: str):
        result = 0
        temp = 1  # 存放一个单位的数字如:十万
        count = 0  # 判断是否有chArr
        cnArr = ['壹', '贰', '叁', '肆', '伍', '陆', '柒', '捌', '玖']
        chArr = ['拾', '佰', '仟', '万', '亿']
        for i in range(len(strNum)):
            b = True
            c = strNum[i]
            for j in range(len(cnArr)):
                if c == cnArr[j]:
                    if count != 0:
                        result += temp
                        count = 0
                    temp = j + 1
                    b = False
                    break
            if b:
                for j in range(len(chArr)):
                    if c == chArr[j]:
                        if j == 0:
                            temp *= 10
                        elif j == 1:
                            temp *= 100
                        elif j == 2:
                            temp *= 1000
                        elif j == 3:
                            temp *= 10000
                        elif j == 4:
                            temp *= 100000000
                    count += 1
            if i == len(strNum) - 1:
                result += temp
        return result
     
     
    def text1(new_img):
        #提取出票日期
     
        left = 80
        top = 143
        right = 162
        bottom = 162
        image_text1 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text1.show()
        txt1 = tool.image_to_string(image_text1)
     
        print(txt1)
        return str(txt1)
    def text2(new_img):
        #提取金额
     
        left = 224
        top = 355
        right = 585
        bottom = 380
        image_text2 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text2.show()
        image_text2.save("img/tmp.png")
     
        temp = ocr.ocr("img/tmp.png")
     
        temp="".join(temp[0])
        txt2=chineseNumber2Int(temp)
        print(txt2)
     
        return txt2
     
    def text3(new_img):
        #提取出票人
     
        left = 177
        top = 207
        right = 506
        bottom = 231
        image_text3 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text3.show()
        image_text3.save("img/tmp.png")
     
        temp = ocr.ocr("img/tmp.png")
        txt3="".join(temp[0])
     
        print(txt3)
        return txt3
    def text4(new_img):
        #提取付款行
     
        left = 177
        top = 274
        right = 492
        bottom = 311
        image_text4 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text4.show()
        image_text4.save("img/tmp.png")
     
        temp = ocr.ocr("img/tmp.png")
        txt4="".join(temp[0])
     
        print(txt4)
        return txt4
    def text5(new_img):
        #提取汇票到日期
     
        left = 92
        top = 166
        right = 176
        bottom = 184
        image_text5 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text5.show()
        txt5 = tool.image_to_string(image_text5)
     
        print(txt5)
        return txt5
    def text6(new_img):
        #提取票据号码
     
        left = 598
        top = 166
        right = 870
        bottom = 182
        image_text6 = new_img.crop((left, top, right, bottom))
        #展示图片
        #image_text6.show()
        txt6 = tool.image_to_string(image_text6)
     
        print(txt6)
        return txt6
     
     
     
    ocr=CnOcr()
     
    tool = pyocr.get_available_tools()[0]
     
    filePath='img'
    img_name=[]
    for i,j,name in os.walk(filePath):
        img_name=name
    count=1
     
    book = xlwt.Workbook(encoding='utf-8',style_compression=0)
    sheet = book.add_sheet('test',cell_overwrite_ok=True)
     
    for i in img_name:
        img_url = filePath+"/"+i
        with open(img_url, 'rb') as f:
            a = f.read()
        new_img = PI.open(io.BytesIO(a))
        ## 写入csv
        col = ('年份','出票日期','金额','出票人','付款行全称','汇票到日期','备注')
        for j in range(0,7):
            sheet.write(0,j,col[j])
        book.save('1.csv')
        shijian=text1(new_img)
        sheet.write(count,0,shijian[0:4])
        sheet.write(count,1,shijian[5:])
        sheet.write(count,2,text2(new_img))
        sheet.write(count,3,text3(new_img))
        sheet.write(count,4,text4(new_img))
        sheet.write(count,5,text5(new_img))
        sheet.write(count,6,text6(new_img))
        count = count + 1

    以上就是怎么通过Python实现批量数据提取的详细内容,更多请关注php中文网其它相关文章!

    声明:本文转载于:亿速云,如有侵犯,请联系admin@php.cn删除
    专题推荐:Python
    上一篇:Python之debug调试的方法是什么 下一篇:自己动手写 PHP MVC 框架(40节精讲/巨细/新人进阶必看)

    相关文章推荐

    • 怎么用Python展示全国高校的分布情况• python apscheduler cron定时任务触发接口自动化巡检怎么实现• Python中的Array模块怎么使用• python numpy中linspace函数怎么使用• 使用树状图可视化聚类
    1/1

    PHP中文网