One Crawler to Scrape All the Data from a Small Website
Today, with some free time on my hands, I decided to write a crawler for fun. While browsing the web I came across a joke site and found its content fairly entertaining, so on a whim I wondered whether I could write a Python program to grab a few entries and take a look. Before I knew it, I had pulled down the site's entire dataset.
The site keeps all of its main data right in the detail HTML, so the lxml module's XPath support can parse the tag contents, extract the data we want, and save it to a local file, all in one smooth pass. Once one page can be scraped, a simple loop fetches every page. The full code follows below.
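First, a minimal sketch of that one-page flow, just to make the idea concrete. The URL and XPath are lifted from the full program below and are specific to this site's layout at the time of writing, so treat them as assumptions about its current HTML:

import requests
from lxml import etree

# Fetch one list page and pull each article title out of the HTML
page = requests.get("https://www.biedoul.com/index/1")
html = etree.HTML(page.content)
titles = html.xpath('/html/body/div/div/div/dl/span/dd/a/strong/text()')

# Append the titles to a local file, one per line
with open('titles.txt', 'a', encoding='UTF8') as f:
    for title in titles:
        f.write(title + "\n")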
Enough talk; here is the full Python code:
import requests
import csv
from lxml import etree
import time


class Page:
    def __init__(self):
        self.pre_url = "https://www.biedoul.com"
        self.start_page = 1
        self.end_page = 15233  # last list page of the site at the time of writing
    # Request one list page and return the article <dl> nodes;
    # retries up to `opportunity` more times on failure.
    def askHTML(self, current_page, opportunity):
        print("=============================== current page => " + str(current_page) + " ===============================")
        try:
            pre_url = self.pre_url + "/index/" + str(current_page)
            page = requests.get(url=pre_url)
            html = etree.HTML(page.content)
            articles = html.xpath('/html/body/div/div/div/dl')
            return articles
        except Exception:
            if opportunity > 0:
                # time.sleep() takes seconds, so this waits over eight minutes
                # before retrying; lower the value if that is too conservative
                time.sleep(500)
                print("=============================== retry => " + str(opportunity) + " ===============================")
                return self.askHTML(current_page, opportunity - 1)
            else:
                return None
    # Extract the link, title, body text, image links, and vote counts
    # from each article node.
    def analyze(self, articles):
        lines = []
        for article in articles:
            data = {}
            data["link"] = article.xpath("./span/dd/a/@href")[0]
            data["title"] = article.xpath("./span/dd/a/strong/text()")[0]
            data["content"] = self.analyze_content(article)
            picture_links = article.xpath("./dd/img/@src")
            if picture_links:
                data["picture_links"] = picture_links
            else:
                data["picture_links"] = []
            data["good_zan"] = self.analyze_zan(article, "good")
            data["bad_bs"] = self.analyze_zan(article, "bad")
            lines.append(data)
        return lines
    # Parse the article body text
    def analyze_content(self, article):
        # 1. Check whether the dd tag holds the text directly
        content = article.xpath("./dd/text()")
        if content is not None and len(content) > 0 and not self.is_empty_list(content):
            return content
        content = []
        p_list = article.xpath("./dd")
        for p in p_list:
            # 2. Check whether the text sits under a nested font tag
            if not content:
                fonts = p.xpath(".//font")
                for font_html in fonts:
                    font_content = font_html.xpath("./text()")
                    if font_content is not None and len(font_content) > 0:
                        content.append(font_content)
            # 3. Check whether the text sits under a nested p tag
            if not content:
                fonts = p.xpath(".//p")
                for font_html in fonts:
                    font_content = font_html.xpath("./text()")
                    if font_content is not None and len(font_content) > 0:
                        content.append(font_content)
        return content
    # Read the vote count for the given type ("good" or "bad");
    # returns 0 when the node is missing.
    def analyze_zan(self, article, zan_type):
        num = article.xpath("./div/div/a[@class='pinattn " + zan_type + "']/p/text()")
        if num is not None and len(num) > 0:
            return num[0]
        return 0
    # Loop over every list page, parse it, and append the rows to article.csv
    def do_word(self):
        fieldnames = ['index', 'link', 'title', 'content', 'picture_links', 'good_zan', 'bad_bs']
        with open('article.csv', 'a', encoding='UTF8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            # writer.writeheader()  # uncomment on the first run to write the header row
            # end_page + 1 so the last page is included (range() excludes its stop value)
            for i in range(self.start_page, self.end_page + 1):
                articles = self.askHTML(i, 3)
                if articles is None:
                    continue
                article_list = self.analyze(articles)
                self.save(writer, article_list)
    # Write the parsed rows to the CSV file
    def save(self, writer, lines):
        print("##### saving to file...")
        # (on Python 2, file could be used in place of open)
        print(lines)
        writer.writerows(lines)
        print("##### saved successfully...")
    # True when every string in the list is blank after stripping newlines
    def is_empty_list(self, items):
        for item in items:
            if not self.empty(item):
                return False
        return True

    def empty(self, content):
        result = content.replace("\r", "").replace("\n", "")
        if result == "":
            return True
        return False
    # Recursively unwrap nested font tags to reach the article text
    # (kept for deeply nested pages; not called in the flow above)
    def analyze_font_content(self, font_html, depth):
        content = []
        font_content_list = font_html.xpath("./font/text()")
        if font_content_list is not None and len(font_content_list) > 0 and not self.is_empty_list(font_content_list):
            for font_content in font_content_list:
                content.append(font_content)
        else:
            if depth < 0:
                return []
            # xpath returns a list, so recurse into the first matching child
            children = font_html.xpath("./font")
            if not children:
                return []
            return self.analyze_font_content(children[0], depth - 1)
        return content

if __name__ == '__main__':
    page = Page()
    page.do_word()
Before running the code above, you need to install the requests and lxml modules:
pip install requests
pip install lxml
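After a run, it is easy to sanity-check the output. Here is a minimal sketch that reads back the first few rows of article.csv; it assumes the file was produced by the code above, which writes no header row, so the field names are passed explicitly:

import csv

# Read back a few scraped rows to verify the CSV output
fieldnames = ['index', 'link', 'title', 'content', 'picture_links', 'good_zan', 'bad_bs']
with open('article.csv', encoding='UTF8', newline='') as f:
    reader = csv.DictReader(f, fieldnames=fieldnames)
    for i, row in enumerate(reader):
        print(row['title'], row['good_zan'], row['bad_bs'])
        if i >= 4:
            break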
If you have any questions about this crawler, feel free to leave me a comment. And if you like the idea behind it, follow the WeChat official account 【智享學(xué)習(xí)】, where I will share more interesting programming projects.