表格文字識別(異步接口)
圖片轉excel
百度ai官方文檔:https://ai.baidu.com/ai-doc/OCR/Ik3h7y238
使用的是表格文字識別(異步接口),同步接口已經下線
文章來源:http://www.zghlxwxcb.cn/news/detail-656047.html
import requests
import json
import base64
import time
'''
文檔:https://ai.baidu.com/ai-doc/OCR/Ik3h7y238
'''
# 獲取access_token地址:https://console.bce.baidu.com/ai/#/ai/ocr/app/list
def get_access_token():
    """Request an OAuth access token from the Baidu token endpoint.

    The API Key / Secret Key placeholders below must be replaced with the
    values of your own application.

    Returns:
        The ``access_token`` value from the endpoint's JSON reply.

    Raises:
        KeyError: if the reply contains no ``access_token``; the message
            includes the reply so the cause can be diagnosed.
    """
    client_id = "xxxxxxxxxxxxxxxxxx"  # your API Key
    client_secret = "xxxxxxxxxxxxxxxxxxxxxx"  # your Secret Key
    host = 'https://aip.baidubce.com/oauth/2.0/token'
    # Hand the query to requests instead of formatting it into the URL so
    # that the credential values are URL-encoded.
    query = {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    }
    data = requests.get(host, params=query).json()
    access_token = data.get('access_token')
    if not access_token:
        raise KeyError('access_token missing from token response: {}'.format(data))
    return access_token
# 獲取識別結果
def get_info(access_token, image_path='1.jpg'):
    """Submit an image to the asynchronous table-recognition endpoint.

    Args:
        access_token: token appended to the request URL as ``access_token``.
        image_path: path of the image to upload. Defaults to ``'1.jpg'``,
            the file name this function previously had hard-coded.

    Returns:
        The decoded JSON body of the response.
    """
    request_url = "https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/request"
    # Read the image as raw bytes; the context manager closes the handle,
    # which the earlier bare open() never did.
    with open(image_path, 'rb') as image_file:
        img = base64.b64encode(image_file.read())  # base64-encode the image
    params = {"image": img}
    request_url = request_url + "?access_token=" + access_token
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    response = requests.post(request_url, data=params, headers=headers)
    return response.json()
# 獲取excel
def get_excel(requests_id, access_token):
    """Fetch the result of a recognition request and save it as a workbook.

    Posts ``request_id`` / ``result_type='excel'`` to the result endpoint,
    downloads the file behind ``result['result_data']`` and writes it to
    ``'識(shí)別結(jié)果.xls'`` in the current working directory, then prints
    ``result['ret_msg']``.

    Args:
        requests_id: value sent as ``request_id``.
        access_token: token appended to the request URL as ``access_token``.
    """
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    query = {
        'request_id': requests_id,
        'result_type': 'excel'
    }
    url = 'https://aip.baidubce.com/rest/2.0/solution/v1/form_ocr/get_request_result'
    url_all = url + "?access_token=" + access_token
    res = requests.post(url_all, headers=headers, params=query)
    # Decode the body once instead of re-parsing it for every field.
    result = res.json()['result']
    info_1 = result['ret_msg']
    excel_url = result['result_data']
    if not excel_url:
        # NOTE(review): presumably the link is empty while the conversion is
        # still running -- confirm against the API docs. Report the status
        # message instead of letting requests.get('') fail on an empty URL.
        print(info_1)
        return
    excel_1 = requests.get(excel_url).content
    # The raw workbook bytes are no longer dumped to stdout before saving.
    with open('識(shí)別結(jié)果.xls', 'wb') as f:
        f.write(excel_1)
    print(info_1)
def main():
    """Run the asynchronous flow: authenticate, submit the image, save the workbook.

    Stops after printing the error message when the submit response carries
    no usable ``request_id``, instead of continuing with an undefined id.
    """
    print('正在處理中請(qǐng)稍后')
    access_token = get_access_token()
    data_1 = get_info(access_token)
    try:
        requests_id = data_1['result'][0]['request_id']
    except (KeyError, IndexError, TypeError):
        # The response does not have the expected result[0].request_id shape.
        requests_id = ''
    if not requests_id:
        print('識(shí)別錯(cuò)誤')
        return
    print('識(shí)別完成')
    print('正在獲取excel')
    # Wait ten seconds so the server-side image-to-excel conversion can
    # finish; larger tables convert more slowly and may need a longer delay.
    time.sleep(10)
    get_excel(requests_id, access_token)
# Run the flow only when executed as a script, so importing this module
# does not trigger network requests.
if __name__ == '__main__':
    main()
表格文字識別V2
圖片/pdf轉excel通用
import requests
import json
import base64
# Credentials sent to the token endpoint by get_access_token(); the
# placeholder values below must be replaced before running.
CLIENT_ID = "xxxxxxxxxxxxxxxxx" # your API Key; must be changed
CLIENT_SECRET = "xxxxxxxxxxxxxxxxxxxxx" # your Secret Key; must be changed
# 獲取access_token
def get_access_token():
    """Exchange CLIENT_ID / CLIENT_SECRET for an OAuth access token.

    Returns:
        The ``access_token`` value from the token endpoint's JSON reply.

    Raises:
        ValueError: if the reply contains no ``access_token``.
    """
    auth_url = 'https://aip.baidubce.com/oauth/2.0/token'
    params = {
        'grant_type': 'client_credentials',
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
    }
    response = requests.post(auth_url, data=params)
    data = response.json()
    access_token = data.get('access_token')
    if not access_token:
        # Raising a bare string is a TypeError in Python 3; wrap the message
        # in a real exception so callers actually see it.
        raise ValueError("請(qǐng)輸入正確的client_id 和 client_secret")
    return access_token
def save_excel(b64_excel, excel_name):
    """Decode a base64-encoded workbook and write its bytes to ``excel_name``.

    Args:
        b64_excel: base64 text (str or bytes) of the workbook.
        excel_name: path of the file to create or overwrite.
    """
    # Decode before opening so a decoding failure never touches the target file.
    workbook_bytes = base64.b64decode(b64_excel)
    with open(excel_name, 'wb') as out_file:
        out_file.write(workbook_bytes)
def to_excel(file_path, excel_name):
    """Send an image or PDF to the table-recognition endpoint and save the Excel result.

    Args:
        file_path: path of the input file. Its extension (compared
            case-insensitively) selects the request field: jpg / jpeg / png /
            bmp are sent as ``image``, pdf as ``pdf_file``.
        excel_name: path the returned workbook is written to.

    Raises:
        ValueError: if the extension is not one of the supported ones. This
            is checked before any file or network access.
    """
    # Pick the request field first; an unsupported type used to surface only
    # later as an unbound ``data`` variable, after authenticating.
    ext = file_path.split('.')[-1].lower()
    if ext in ('jpg', 'jpeg', 'png', 'bmp'):
        field = 'image'
    elif ext == 'pdf':
        field = 'pdf_file'
    else:
        raise ValueError('unsupported file type: {!r}'.format(file_path))

    access_token = get_access_token()
    request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/table"
    # Read the file as raw bytes and base64-encode it for the form body.
    with open(file_path, 'rb') as f:
        encoded = base64.b64encode(f.read())
    data = {
        field: encoded,
        "return_excel": 'true',
    }
    headers = {'content-type': 'application/x-www-form-urlencoded'}
    # POST the file for table recognition.
    response = requests.post(request_url, params={'access_token': access_token}, data=data, headers=headers)
    # A successful HTTP status alone is not enough: only report success when
    # the reply really carries a workbook, so no empty file is written.
    b64_excel = response.json().get('excel_file') if response.ok else None
    if b64_excel:
        save_excel(b64_excel, excel_name)
        print('轉(zhuǎn)換完成')
    else:
        print('轉(zhuǎn)換失敗')
if __name__ == '__main__':
    # Input files to convert: an image and a PDF.
    img_path, pdf_path = '1.png', 'table.pdf'
    # Convert each input in order, pairing it with its output workbook name.
    for source, workbook in ((img_path, 'out_pic.xlsx'), (pdf_path, 'out_pdf.xlsx')):
        to_excel(file_path=source, excel_name=workbook)
文章來源地址http://www.zghlxwxcb.cn/news/detail-656047.html
到了這里,關於python調用百度ai將圖片/pdf識別為表格excel的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持TOY模板網!