A Selenium crawler: scraping The Paper (澎湃新聞) search results as a worked example
Preface
This program aims to accomplish the following:
(1) scrape the full text of every article The Paper returns for the keyword "反腐" (anti-corruption);
(2) record each title and link in an Excel spreadsheet;
(3) collect the titles and article bodies into a single Word document.
It has been a while since I wrote code in earnest, so I am a little rusty and the coupling is fairly high, but the goals were met.
Required imports
import time
import docx
import xlwt
from docx.oxml.ns import qn
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
Pitfalls to watch out for
The WebDriver version must match the browser
If you use Google Chrome, newer driver builds can be found on the official ChromeDriver download page; for other browsers, a quick search will turn up the matching driver.
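As a side note, Selenium 4.6+ ships with Selenium Manager, which can usually download a matching driver automatically, and you can also point at a manually downloaded driver. A minimal sketch (the driver path is hypothetical, for illustration only):

from selenium import webdriver
from selenium.webdriver.edge.service import Service

# Selenium 4.6+: Selenium Manager resolves a matching driver automatically
driver = webdriver.Edge()

# Alternatively, point at a manually downloaded driver (hypothetical path)
driver = webdriver.Edge(service=Service("C:/tools/msedgedriver.exe"))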
If the crawler opens a new page, remember to switch to it
I did not know this at first and spent ages failing to locate the elements I wanted, only to discover I had never switched to the second page; naturally the elements could not be found on the first one. The switching code is as follows:
new_window = driver.window_handles[1]  # grab the handle of the second window
driver.switch_to.window(new_window)    # switch to the new window
driver.refresh()                       # refresh
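For completeness, here is a small sketch of the full round trip, switching back to the original tab once you are done with the new one (the URL is a placeholder):

original = driver.current_window_handle  # remember where we started
driver.switch_to.new_window('tab')       # Selenium 4: open and switch to a new tab
driver.get("https://example.com")        # placeholder URL
driver.close()                           # close the new tab
driver.switch_to.window(original)        # return to the original tab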
XPath and CSS selectors can be copied directly
The copying flow was shown in a screenshot here (in the browser DevTools, right-click the element and choose Copy > Copy XPath or Copy selector); it is far easier than writing them by hand.
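A copied XPath and a copied CSS selector are interchangeable ways of locating the same element (both selectors below are hypothetical, for illustration only):

# located via a copied XPath (hypothetical path)
title = driver.find_element(By.XPATH, "//main/div[4]/div[1]/div[1]/div/h1")
# the same element via a copied CSS selector (hypothetical selector)
title = driver.find_element(By.CSS_SELECTOR, "main h1")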
Remember to use try when scraping many pages
For example, some of the links in The Paper's search results open videos rather than articles. We do not need those, their elements live in different places, and they are very likely to raise an error and abort the run. A try-except block lets us skip them:
try:
    x_path = "//main/div[4]/div[1]/div[1]/div/h1"
    title = driver.find_element(By.XPATH, x_path)
    x_path = "//main/div[4]/div[1]/div[1]/div/div[2]"
    article = driver.find_element(By.XPATH, x_path)
    print(title.text)
    print(article.text)
    file.add_paragraph(article.text)
except:
    print("not an article")
Adjusting fonts when writing into Word
The code is as follows:
for para in file.paragraphs:
    for run in para.runs:
        run.font.size = docx.shared.Pt(10)  # set font size to 10 pt
        run.font.name = 'Times New Roman'   # Latin text
        run._element.rPr.rFonts.set(qn('w:eastAsia'), u'楷體')  # East Asian text
Note that it is safest to prefix the Chinese font name with u, and that qn needs its own import:
from docx.oxml.ns import qn
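Instead of touching every run after the fact, a commonly used python-docx recipe is to set the document's default style once, before adding paragraphs. A minimal sketch, with the same font choices as above (untested here, so treat it as an assumption):

import docx
from docx.shared import Pt
from docx.oxml.ns import qn

file = docx.Document()
style = file.styles['Normal']        # the default paragraph style
style.font.size = Pt(10)             # 10 pt
style.font.name = 'Times New Roman'  # Latin text
# the East Asian font has to be set on the underlying XML element
style.element.rPr.rFonts.set(qn('w:eastAsia'), u'楷體')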
Full program
import time
import docx
import xlwt
from docx.oxml.ns import qn
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

def main():
    driver = webdriver.Edge()
    driver.get("https://www.thepaper.cn/")
    time.sleep(1)
    search = driver.find_element(By.TAG_NAME, 'input')
    search.send_keys("反腐")  # the search keyword
    time.sleep(1)
    x_path = "//main/div/div/div/div/div/div/div/span"
    send_button = driver.find_element(By.XPATH, x_path)
    ActionChains(driver).move_to_element(send_button).click(send_button).perform()
    time.sleep(1)
    x_path = "//main/div[3]/div[1]/div/div[2]/div/ul/li[2]"
    send_button = driver.find_element(By.XPATH, x_path)
    ActionChains(driver).move_to_element(send_button).click(send_button).perform()
    time.sleep(1)
    last_height = driver.execute_script("return document.body.scrollHeight")  # current page height
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:  # keep scrolling until we reach the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll down
        time.sleep(2)  # wait for the page to load
        new_height = driver.execute_script("return document.body.scrollHeight")  # current page height
        if new_height == last_height:  # no change means we hit the bottom
            break
        last_height = new_height
    x_path = "//main/div[3]/div[1]/div/div/div/ul/li/div/a"
    names = driver.find_elements(By.XPATH, x_path)
    name_text = []
    name_href = []
    num = -1
    for name in names:
        name_text.append(name.text)
        name_href.append(name.get_attribute("href"))
        num = num + 1
        print(name.text)
        print(name.get_attribute("href"))
    file = docx.Document()  # create the docx document
    workbook = xlwt.Workbook()
    sheet1 = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
    sheet1.write(0, 0, 'Title')
    sheet1.write(0, 1, 'Link')
    for i in range(num + 1):
        print(name_text[i])
        print(name_href[i])
        address = name_href[i]
        driver.get(address)
        file.add_paragraph(name_text[i])
        sheet1.write(i + 1, 0, name_text[i])
        sheet1.write(i + 1, 1, name_href[i])
        try:
            x_path = "//main/div[4]/div[1]/div[1]/div/h1"
            title = driver.find_element(By.XPATH, x_path)
            x_path = "//main/div[4]/div[1]/div[1]/div/div[2]"
            article = driver.find_element(By.XPATH, x_path)
            print(title.text)
            print(article.text)
            file.add_paragraph(article.text)
        except:
            print("not an article")
    for para in file.paragraphs:
        for run in para.runs:
            run.font.size = docx.shared.Pt(10)  # set font size to 10 pt
            run.font.name = 'Times New Roman'   # Latin text
            run._element.rPr.rFonts.set(qn('w:eastAsia'), u'楷體')  # East Asian text
    file.save("crawlerResult.docx")
    workbook.save('./crawlerResult.xls')

if __name__ == '__main__':
    main()
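One refinement worth mentioning: the fixed time.sleep(1) calls can be replaced with explicit waits, which proceed as soon as the element is actually ready instead of sleeping a fixed interval. A minimal sketch reusing one of the XPaths above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)  # poll for up to 10 seconds
send_button = wait.until(
    EC.element_to_be_clickable((By.XPATH, "//main/div[3]/div[1]/div/div[2]/div/ul/li[2]"))
)
send_button.click()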
Extension
The functionality is now extended as follows:
(1) scrape and store the articles returned by searching each of three keywords: "反腐", "從嚴(yán)治黨", and "廉潔";
(2) keep only the articles that are not duplicates.
To support this, we need a dictionary that tells us whether an article has already been collected:
seen = {}  # records which titles we have already collected (renamed from dict to avoid shadowing the builtin)
names = driver.find_elements(By.XPATH, x_path)
for name in names:
    if name.text not in seen:
        name_text.append(name.text)
        name_href.append(name.get_attribute("href"))
        num = num + 1
        print(name.text)
        print(name.get_attribute("href"))
        seen[name.text] = 1
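Since the dictionary values are never read, a plain set expresses the same intent a little more directly:

seen = set()  # titles we have already collected
for name in names:
    if name.text not in seen:
        seen.add(name.text)
        name_text.append(name.text)
        name_href.append(name.get_attribute("href"))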
I also found that some scraped URLs may have gone stale by the time they are visited. These need to be skipped as well, or the program will abort with an exception; again, try-except handles it:
try:
    address = name_href[i]
    driver.get(address)
except:
    print("stale URL")
The extended program is as follows:
import time
import docx
import xlwt
from docx.oxml.ns import qn
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

def main():
    search_word = ['反腐', '從嚴(yán)治黨', '廉潔']
    search_word_len = len(search_word)
    seen = {}  # records which titles we have already collected
    num = -1  # number of titles collected so far
    search_word_num = 0  # index of the keyword currently being searched
    name_text = []
    name_href = []
    for word in search_word:
        search_word_num = search_word_num + 1
        driver = webdriver.Edge()
        driver.get("https://www.thepaper.cn/")
        time.sleep(1)
        search = driver.find_element(By.TAG_NAME, 'input')
        # print(word)
        search.send_keys(word)
        time.sleep(1)
        x_path = "//main/div/div/div/div/div/div/div/span"
        send_button = driver.find_element(By.XPATH, x_path)
        ActionChains(driver).move_to_element(send_button).click(send_button).perform()
        time.sleep(1)
        x_path = "//main/div[3]/div[1]/div/div[2]/div/ul/li[2]"
        send_button = driver.find_element(By.XPATH, x_path)
        ActionChains(driver).move_to_element(send_button).click(send_button).perform()
        time.sleep(1)
        last_height = driver.execute_script("return document.body.scrollHeight")  # current page height
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:  # keep scrolling until we reach the bottom of the page
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll down
            time.sleep(2)  # wait for the page to load
            new_height = driver.execute_script("return document.body.scrollHeight")  # current page height
            if new_height == last_height:  # no change means we hit the bottom
                break
            last_height = new_height
        x_path = "//main/div[3]/div[1]/div/div/div/ul/li/div/a"
        names = driver.find_elements(By.XPATH, x_path)
        for name in names:
            if name.text not in seen:
                name_text.append(name.text)
                name_href.append(name.get_attribute("href"))
                num = num + 1
                print(name.text)
                print(name.get_attribute("href"))
                seen[name.text] = 1
        if search_word_num == search_word_len:  # last keyword: write out everything collected
            file = docx.Document()  # create the docx document
            workbook = xlwt.Workbook()
            sheet1 = workbook.add_sheet('sheet1', cell_overwrite_ok=True)
            sheet1.write(0, 0, 'Title')
            sheet1.write(0, 1, 'Link')
            for i in range(num + 1):
                print(name_text[i])
                print(name_href[i])
                try:
                    address = name_href[i]
                    driver.get(address)
                except:
                    print("stale URL")
                file.add_paragraph(name_text[i])
                sheet1.write(i + 1, 0, name_text[i])
                sheet1.write(i + 1, 1, name_href[i])
                try:
                    x_path = "//main/div[4]/div[1]/div[1]/div/h1"
                    title = driver.find_element(By.XPATH, x_path)
                    x_path = "//main/div[4]/div[1]/div[1]/div/div[2]"
                    article = driver.find_element(By.XPATH, x_path)
                    print(title.text)
                    print(article.text)
                    file.add_paragraph(article.text)
                except:
                    print("not an article")
            for para in file.paragraphs:
                for run in para.runs:
                    run.font.size = docx.shared.Pt(10)  # set font size to 10 pt
                    run.font.name = 'Times New Roman'   # Latin text
                    run._element.rPr.rFonts.set(qn('w:eastAsia'), u'楷體')  # East Asian text
            file.save("crawlerResult.docx")
            workbook.save('./crawlerResult.xls')
        else:
            driver.close()
    print(seen.keys())

if __name__ == '__main__':
    main()
Results
The Word document runs to 2,203 pages and about 3.24 million characters.
The Excel sheet has 1,768 rows (1,767 article titles, with the first row as the header).