import requests
import re
import json
import os
# Module-level HTTP session shared by fetch_url for every request it makes.
session = requests.session()
def fetch_url(url, encoding='gbk'):
    """Download *url* with the shared session and return its body as text.

    Args:
        url: Address to request with HTTP GET.
        encoding: Codec used to decode the raw response bytes. Defaults to
            'gbk', the codec this script originally hard-coded.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    # Decode the raw bytes explicitly instead of relying on the encoding
    # that requests would guess from the response headers.
    raw = session.get(url).content
    return raw.decode(encoding)
def get_doc_id(url):
    """Extract the document id from a ``.../view/<id>.html`` style URL.

    Args:
        url: Document page URL containing ``view/<id>.html``.

    Returns:
        The text between ``view/`` and the first ``.html`` that follows it.

    Raises:
        IndexError: If *url* does not contain ``view/<id>.html``.
    """
    # The dot is escaped and the group is lazy so that a later ".html"
    # (for example inside the query string) is not swallowed into the id.
    match = re.search(r'view/(.*?)\.html', url)
    if match is None:
        # IndexError is kept because that is what the previous
        # ``findall(...)[0]`` form raised for callers.
        raise IndexError('no document id found in url: %r' % (url,))
    return match.group(1)
def parse_type(content):
    """Return the value of the first ``docType: '<value>',`` entry in *content*.

    Args:
        content: Page source text to search.

    Returns:
        The single-quoted value that follows the first ``docType`` key.

    Raises:
        IndexError: If no ``docType`` entry of that shape is present.
    """
    # Same pattern as before with the redundant escapes on ':', "'" and ','
    # dropped; those characters are literal in a regex either way.
    match = re.search(r"docType.*?:.*?'(.*?)',", content)
    if match is None:
        # IndexError is kept because that is what the previous
        # ``findall(...)[0]`` form raised for callers.
        raise IndexError('docType not found in content')
    return match.group(1)
def parse_title(content):
    """Return the value of the first ``title: '<value>',`` entry in *content*.

    Args:
        content: Page source text to search.

    Returns:
        The single-quoted value that follows the first ``title`` key.

    Raises:
        IndexError: If no ``title`` entry of that shape is present.
    """
    # Same pattern as before with the redundant escapes on ':', "'" and ','
    # dropped; those characters are literal in a regex either way.
    match = re.search(r"title.*?:.*?'(.*?)',", content)
    if match is None:
        # IndexError is kept because that is what the previous
        # ``findall(...)[0]`` form raised for callers.
        raise IndexError('title not found in content')
    return match.group(1)
def parse_doc(content):
??? result = ''
??? url_list = re.findall('(https.*?0.json.*?)\\\\x22}', content)
??? url_list = [addr.replace("\\\\\\/", "/") for addr in url_list]
??? for url in url_list[:-5]:
??????? content = fetch_url(url)
??????? y = 0
??????? txtlists = re.findall('"c":"(.*?)".*?"y":文章來(lái)源地址http://www.zghlxwxcb.cn/news/detail-422823.html
# 到了这里,关于百度文库爬虫(爬取需要下载券的文档)的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!