发票验真是一项重复性强、没有任何技巧可言的工作,而且非常繁琐。如果公司规模大一点,一个月有1000张发票需要验证的话,起码要花费5个人天(即一个人连续工作5天)的时间。
基于上述情况,我开发了一个 Python 程序。首先,Python 在图像识别方面有做得比较好的库,我们可以直接拿过来使用,不必重复造轮子。
图像识别我采用了百度飞桨(PaddlePaddle)的 PaddleOCR 库,感兴趣的小伙伴可以去看看。一般情况下它的识别率非常高,官方也有现成的文档帮助我们搭建环境,也可以在 bilibili 上查看一些博主的视频教程。
from paddleocr import PaddleOCR
import re
# Patterns applied to the bracket-delimited OCR text ("【token】【token】...").
_INVOICE_NUMBER_RE = re.compile(r'【([0-9]{8})】')
_INVOICE_CODE_RE = re.compile(r'【([0-9]{10,12})】')
_INVOICE_DATE_RE = re.compile(
    r'([0-9]{4})\s*年\s*([0-9]{1,2})\s*月\s*([0-9]{1,2})')
_DECIMAL_TOKEN_RE = re.compile(r'([0-9]*?\..*?)】')


def _is_ocr_line(entry):
    """Return True if *entry* looks like one OCR line: [box, (text, score)]."""
    return (
        isinstance(entry, (list, tuple))
        and len(entry) == 2
        and isinstance(entry[1], (list, tuple))
        and len(entry[1]) > 0
        and isinstance(entry[1][0], str)
    )


def _ocr_texts(result):
    """Collect the recognised strings from a PaddleOCR ``ocr()`` result.

    Accepts both the flat layout (a list of lines) and the nested layout
    (one list of lines per image, possibly None when nothing was detected).
    NOTE(review): the nested layout is what PaddleOCR 2.6+ is understood to
    return -- confirm against the installed version.
    """
    texts = []
    for entry in result or []:
        if entry is None:
            continue
        if _is_ocr_line(entry):
            texts.append(entry[1][0])
        else:
            texts.extend(line[1][0] for line in entry if _is_ocr_line(line))
    return texts


def _find_company(texts):
    """Return the last recognised string containing '公司', or ''."""
    for text in reversed(texts):
        if '公司' in text:
            return text
    return ''


def _first_match(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ''."""
    match = pattern.search(text)
    return match.group(1) if match else ''


def _parse_invoice_date(text):
    """Return the first 'YYYY年M月D' date in *text* as 'YYYYMMDD', or ''."""
    match = _INVOICE_DATE_RE.search(text)
    if match is None:
        return ''
    year, month, day = match.groups()
    # Zero-pad so that "2021年3月5日" and "2021年03月05日" both give 20210305.
    return year + month.zfill(2) + day.zfill(2)


def _pick_amount(text):
    """Return the second-largest decimal number found in *text*, or 0.

    NOTE(review): presumably the largest decimal on an invoice is the
    tax-inclusive total and the second largest is the amount to verify --
    confirm against real invoices.
    """
    values = []
    for token in _DECIMAL_TOKEN_RE.findall(text):
        try:
            values.append(float(token))
        except ValueError:
            # Token contains a dot but is not a number (e.g. "1.2.3"); skip.
            continue
    if len(values) < 2:
        return 0
    values.sort()
    return values[-2]


def runOrc(self, path):
    """Run OCR on an invoice image and extract the fields needed to verify it.

    The recognised strings are joined into one bracket-delimited text
    ("【a】【b】..."), which is printed and then searched with regular
    expressions.

    Args:
        path: Path of the invoice image, passed to ``PaddleOCR.ocr``.

    Returns:
        ``[number, daima, ymd, amount, gongsi]`` (also stored in
        ``self.OneModel``), or ``None`` when the recognised text contains
        neither '发' nor '票', in which case no attribute is modified.

    Attributes set on ``self`` (each always assigned for an invoice):
        gongsi: last recognised string containing '公司', else ''.
        number: first standalone 8-digit token (invoice number), else ''.
        daima:  first standalone 10-12 digit token (invoice code), else ''.
        ymd:    invoice date as 'YYYYMMDD', else ''.
        amount: second-largest decimal value as a float, else 0.
    """
    # NOTE(review): the models are loaded from fixed local directories on
    # every call; consider creating the PaddleOCR instance once and reusing it.
    ocr = PaddleOCR(rec_model_dir=r'c:\paddleOrcModel\rec',
                    cls_model_dir=r'c:\paddleOrcModel\cls',
                    det_model_dir=r'c:\paddleOrcModel\det')
    texts = _ocr_texts(ocr.ocr(path))

    joined = '【' + '】【'.join(texts) + '】'
    print(joined)
    if '发' not in joined and '票' not in joined:
        # Does not look like an invoice at all.
        return None

    # Drop ASCII letters so that tokens such as "No12345678" become pure
    # digit tokens that the number/code patterns can match.
    cleaned = re.sub('[a-zA-Z]', '', joined)

    self.gongsi = _find_company(texts)
    self.number = _first_match(_INVOICE_NUMBER_RE, cleaned)
    self.daima = _first_match(_INVOICE_CODE_RE, cleaned)
    self.ymd = _parse_invoice_date(cleaned)
    self.amount = _pick_amount(cleaned)

    self.OneModel = [self.number, self.daima, self.ymd, self.amount,
                     self.gongsi]
    print(self.OneModel)
    return self.OneModel
因为现在国税的官方验证网站有反机器人的功能,所以不能再通过程序在页面上自动操作来验证了。拿到发票数据后,只能通过大厂提供的 API 接口去验证。我这边采用了百度的接口,价格也不算贵。
import requests
'''
OCR-增值税发票验真
'''
'''
invoice_code 代码
invoice_date 日期
invoice_num 编号
total_amount 开票金额
invoice_type 发票类型 special_vat_invoice
'''
def check(invoice_code, invoice_date, invoice_num, total_amount, invoice_type,
          access_token, timeout=30):
    """Verify a VAT invoice through Baidu's invoice verification endpoint.

    Args:
        invoice_code: Invoice code.
        invoice_date: Invoice date.
        invoice_num: Invoice number.
        total_amount: Invoiced amount.
        invoice_type: Invoice type, e.g. ``special_vat_invoice``.
        access_token: Access token, sent as the ``access_token`` query
            parameter.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the caller forever.

    Returns:
        The decoded JSON response body (also printed) when the HTTP status
        indicates success, otherwise ``None``.

    Raises:
        Exceptions raised by ``requests`` for network failures, timeouts, or
        a response body that is not valid JSON are propagated unchanged.
    """
    request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/vat_invoice_verification"
    form = {
        "check_code": "",
        "invoice_code": invoice_code,
        "invoice_date": invoice_date,
        "invoice_num": invoice_num,
        "invoice_type": invoice_type,
        "total_amount": total_amount,
    }
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    response = requests.post(
        request_url,
        # Let requests build and URL-encode the query string.
        params={"access_token": access_token},
        data=form,
        headers=headers,
        timeout=timeout,
    )
    if not response.ok:
        return None
    payload = response.json()
    print(payload)
    return payload
最终实现的效果
(原文此处应为一张“最终效果”截图,图片未能保留。)