First, the required environment (I used Python 2; you can choose Python 3 instead and resolve any issues you hit yourself; this setup is currently crawling several million records for me):
Environment:
Python 2.7.10
Scrapy 1.5.0
Third-party libraries:
PyMySQL==0.8.0
Scrapy==1.5.0
pytesseract==0.2.0
pip==10.0.1
Pillow==5.1.0
logger==1.4
bs4==0.0.1
requests==2.18.4
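These can be installed with pip in one go (note that pytesseract also requires the Tesseract OCR engine itself to be installed on the system):

pip install PyMySQL==0.8.0 Scrapy==1.5.0 pytesseract==0.2.0 Pillow==5.1.0 logger==1.4 bs4==0.0.1 requests==2.18.4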
Create the project:
scrapy startproject mytest
Create the spider:
cd mytest
scrapy genspider name XXX.com
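This generates mytest/spiders/name.py with a skeleton roughly like the following (Scrapy fills in the name and domain from the command):

# -*- coding: utf-8 -*-
import scrapy


class NameSpider(scrapy.Spider):
    name = 'name'
    allowed_domains = ['XXX.com']
    start_urls = ['http://XXX.com/']

    def parse(self, response):
        pass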
Here is the full code; the parts that need special attention are called out in the comments.
# -*- coding: utf-8 -*-
import scrapy
import pytesseract  # captcha OCR library
from PIL import Image  # captcha image processing
from scrapy.http import Request
from yishi.items import YishiItem  # the fields to scrape are defined in items.py
from yishi.settings import MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER, MYSQL_PASSWD  # database config from settings.py
import pymysql  # MySQL client
import logging  # logging

# logging setup
log_filename = '../static/data/info.log'
logging.basicConfig(filename=log_filename, filemode='a', level=logging.INFO)
class CreditSpider(scrapy.Spider):
    name = 'name'
    baseURL = 'https://xxx.com'
    # start_urls = ''

    # Set the headers: open the page in a browser and copy the request headers over.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Host': 'xxx',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'xxx',
    }

    # database connection
    connect = pymysql.connect(
        host=MYSQL_HOST,
        db=MYSQL_DBNAME,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWD,
        charset='utf8',
        use_unicode=True)

    # override start_requests
    def start_requests(self):
        return [Request(self.baseURL + 'xxx',
                        headers=self.headers,
                        callback=self.parse,
                        # Scrapy deduplicates request URLs (RFPDupeFilter);
                        # dont_filter=True exempts this request from deduplication.
                        dont_filter=True,
                        )]
    # First, request the page once.
    def parse(self, response):
        # Fetch one row at a time: the search conditions for the list page.
        cursor = self.connect.cursor()
        sql = 'select id,xxx,xxx,xxx from xxx where xxx order by id limit 1'
        cursor.execute(sql)
        res = cursor.fetchall()
        if res:
            # The parameters the site expects, i.e. the search conditions.
            data = {
                "xxx": res[0][1],
                "xxx": '',
                "xxx": '',
                "xxx": res[0][2],
                "xxx": '',
                "xxx": '',
                "xxx": '',
            }
            cursor.close()
            return scrapy.Request(self.baseURL + '/xxx/captcha',  # captcha image URL
                                  headers=self.headers,
                                  # Pass this request's parameters on to the next request;
                                  # cookies and the like can be carried along the same way.
                                  meta={'data': data, 'dr_id': res[0][0], 'static': res[0][3], 'len': len(res)},
                                  callback=self.creditRes,
                                  dont_filter=True)
        else:
            # Stop the spider once the condition table is exhausted;
            # refresh the condition table before each crawl.
            print 'All done!'
    # Request again and save the captcha image.
    def creditRes(self, response):
        # save the captcha
        captchaFile = '../static/images/code/captcha.png'
        with open(captchaFile, 'wb') as f:
            f.write(response.body)
        try:
            # recognize the captcha with pytesseract
            image = Image.open(captchaFile)
            captcha_value = pytesseract.image_to_string(image)
            print 'Captcha: ' + captcha_value
        except Exception:
            # recognition failed: start over from the first request
            logging.info('Failed to read the captcha')
            return Request(self.baseURL + 'xxx',
                           headers=self.headers,
                           callback=self.parse,
                           dont_filter=True)
        # use the recognized captcha as a request parameter
        data = response.meta.get("data")
        data["validCode"] = captcha_value
        return [scrapy.FormRequest(
            url=self.baseURL + 'xxx',  # request the data again with the full parameter set
            formdata=data,
            method='GET',
            # carry along what is needed to save data or update the status later
            meta={'dr_id': response.meta.get("dr_id"), 'static': response.meta.get("static"),
                  'len': response.meta.get("len"), 'captcha_value': captcha_value},
            headers=self.headers,
            callback=self.creditdata,
            dont_filter=True,
        )]
    def creditdata(self, response):
        # Read the captcha error message, if any, to tell whether validation succeeded.
        code_data = response.xpath("//span[@class='error']")
        if code_data:
            # extract() already returns unicode; no .decode('UTF-8') needed
            code = code_data.xpath(".//text()").extract()[0]
            logging.info('Captcha validation failed, captcha: ' + str(response.meta.get("captcha_value")))
        else:
            code = ''
        # When the captcha is wrong the status is left untouched, so the row is crawled again.
        dr_id = response.meta.get("dr_id")
        # No captcha error: update the status and insert the data.
        if code.strip() not in (u'驗(yàn)證碼錯(cuò)誤', u'驗(yàn)證碼不能為空'):
            cursor = self.connect.cursor()
            # status: 0 = not yet crawled, 1 = crawled
            sql = 'update xxx set status=%s where id=%s'
            cursor.execute(sql, (1, dr_id))
            self.connect.commit()
            cursor.close()
        else:
            # captcha failed: do not update the status
            logging.info('Captcha error')
        node_list = response.xpath("//table[@id='formresult']/tbody/tr")
        logging.info('Current condition-table id: ' + str(dr_id))
        if node_list:
            for node in node_list:
                item = YishiItem()
                item['xxx'] = dr_id
                item['xxx'] = node.xpath(".//td[1]/text()").extract()[0]
                item['xxx'] = node.xpath(".//td[2]/text()").extract()[0]
                item['xxx'] = node.xpath(".//td[3]/text()").extract()[0]
                item['xxx'] = node.xpath(".//td[4]/text()").extract()[0]
                item['xxx'] = node.xpath(".//td[5]/text()").extract()[0]
                item['xxx'] = node.xpath(".//td[6]/text()").extract()[0]
                item['xxx'] = node.xpath(".//td[7]/text()").extract()[0]
                yield item
        # Paging: follow the "next" link while it is not disabled
        # (adjust the class names to whatever the target site uses).
        nextPage = response.xpath("//a[@class='next' and not(contains(@class, 'disable'))]")
        if nextPage:
            # URL from the next-page <a> tag
            url = nextPage.xpath("./@href").extract()[0]
            yield scrapy.Request(self.baseURL + '/' + url, callback=self.creditdata)
        # Decide from status=0 whether to keep crawling.
        remaining = response.meta.get("len")
        if remaining != 0:
            yield scrapy.Request(self.baseURL + 'xxx',
                                 headers=self.headers,
                                 callback=self.parse,
                                 dont_filter=True)
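The MYSQL_* constants imported at the top of the spider live in settings.py; a minimal sketch with placeholder values:

# settings.py: database connection settings (placeholder values)
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'xxx'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'xxx'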
items.py setup:
xxx = scrapy.Field()
xxx = scrapy.Field()
...
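For completeness, a minimal items.py sketch; the field names here are placeholders standing in for the xxx fields assigned in the spider:

# -*- coding: utf-8 -*-
import scrapy


class YishiItem(scrapy.Item):
    # one Field per scraped column; rename to your real column names
    dr_id = scrapy.Field()
    col1 = scrapy.Field()
    col2 = scrapy.Field()
    # ...and so on for the remaining <td> columns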
I won't go into the pipeline that writes to the database; implement it to fit your own business logic.
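Still, for reference, here is a minimal PyMySQL pipeline sketch; the table and column names are placeholders, and the pipeline must be enabled under ITEM_PIPELINES in settings.py:

# pipelines.py: a minimal sketch, assuming a placeholder table and columns
import pymysql
from yishi.settings import MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER, MYSQL_PASSWD


class YishiPipeline(object):
    def open_spider(self, spider):
        self.connect = pymysql.connect(host=MYSQL_HOST, db=MYSQL_DBNAME,
                                       user=MYSQL_USER, passwd=MYSQL_PASSWD,
                                       charset='utf8', use_unicode=True)

    def process_item(self, item, spider):
        cursor = self.connect.cursor()
        # parameterized insert; replace the table/column names with real ones
        cursor.execute('insert into xxx (dr_id, col1, col2) values (%s, %s, %s)',
                       (item.get('dr_id'), item.get('col1'), item.get('col2')))
        self.connect.commit()
        cursor.close()
        return item

    def close_spider(self, spider):
        self.connect.close()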
Note: the captcha on my target site is fairly simple, so pytesseract works directly with a recognition rate above 95%; other approaches such as a CNN would also work.
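If the recognition rate drops, a light Pillow preprocessing pass before pytesseract often helps; a sketch, where the threshold of 140 is an assumption to tune per site:

# grayscale + binarize the captcha before OCR (the threshold is a tunable assumption)
from PIL import Image
import pytesseract

def recognize_captcha(path, threshold=140):
    image = Image.open(path).convert('L')  # convert to grayscale
    image = image.point(lambda p: 255 if p > threshold else 0)  # binarize
    return pytesseract.image_to_string(image).strip()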
Personally, I find the requests.get() approach simpler to write (I have tested it locally); I used Scrapy here because the business requirements called for it.
The key point with requests.get() is the line session = requests.session(): the session is what keeps cookies consistent between requests.
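For comparison, a minimal requests-based sketch of the same flow; the URLs and form fields are placeholders. The session object ties the captcha request and the data request together through shared cookies:

import requests
import pytesseract
from PIL import Image

session = requests.session()  # the session carries cookies across requests

# fetch the captcha with the same session so the cookie matches
resp = session.get('https://xxx.com/xxx/captcha', headers={'User-Agent': 'xxx'})
with open('captcha.png', 'wb') as f:
    f.write(resp.content)
captcha_value = pytesseract.image_to_string(Image.open('captcha.png')).strip()

# submit the search with the recognized captcha plus the other conditions
params = {'validCode': captcha_value}
result = session.get('https://xxx.com/xxx', params=params, headers={'User-Agent': 'xxx'})
print(result.text)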