Gitee repository: travel-server (the server side of the tourist-attraction project)
3. Scraping detailed attraction data from Ctrip (攜程) with Selenium in Python
3.1 Prerequisites
- Make sure a Python 3.x environment is installed
- Open cmd as administrator and install selenium, pymysql, requests, and lxml (the latest versions are fine; datetime is part of the Python standard library and needs no installation)
pip install selenium
pip install pymysql
pip install requests
pip install lxml
- Make sure the ChromeDriver build matching your installed Chrome version is in place (put the driver in the Chrome installation directory); it is what lets Selenium control the Chrome browser. Add that directory to the Path environment variable. A quick sanity check is sketched below.
# ChromeDriver installation tutorial: https://blog.csdn.net/linglong_L/article/details/136283810
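A minimal sanity-check sketch: it assumes the driver is already reachable via Path and simply opens a page and prints its title (the URL here is only an example).
from selenium.webdriver import Chrome

# If the driver is missing or does not match the installed Chrome, this line fails immediately
driver = Chrome()
driver.get('https://www.ctrip.com')
print(driver.title)
driver.quit()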
3.2 Approach
- Search for the attractions of the specified city; the site presents the results page by page.
- Use Selenium to collect each attraction's detail-page URL from the result list, request that URL to fetch the detail page, and then pull out the required fields with regular expressions (a small sketch follows this list).
- The detail page provides the following fields: attraction name, grade (1A-5A), address, opening hours (the page shows two versions; we use the lower one), official phone number, an introduction, and images.
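A minimal sketch of the href/title extraction described above, run on a hypothetical fragment of the result-list HTML (on the real page the fragment comes from the element with class right-content-list):
import re

# Hypothetical fragment; every attraction entry on the real page carries href and title attributes
content = '<a href="//you.ctrip.com/sight/xxx/123.html" title="某景區(qū)">某景區(qū)</a>'
for href, title in re.findall(r'href="(.*?)" title="(.*?)"', content):
    print('https:' + href, title)  # -> https://you.ctrip.com/sight/xxx/123.html 某景區(qū)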
3.3 Code walkthrough
3.3.1 Querying all attractions of the specified city
- Launch Chrome under Selenium control and open the search URL that lists all attractions of the city
def __init__(self):
    options = Options()
    options.add_argument('--headless')
    service = Service()
    # Pass the options so that --headless actually takes effect
    self.chrome = Chrome(service=service, options=options)
    self.chrome.get(
        'https://huodong.ctrip.com/things-to-do/list?pagetype=city&citytype=dt&keyword=%E6%A2%85%E5%B7%9E&id=523&name=%E6%A2%85%E5%B7%9E&pshowcode=Ticket2&kwdfrom=srch&bookingtransactionid=1711160613361_6064')
    time.sleep(3)
    self.page = 1
    # Headers reused by requests when fetching the detail pages
    self.headers = {
        'cookie': 'suid=lh/P1+4RKuhAYg684ErS+g==; suid=lh/P1+4RKuhAYg684ErS+g==',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    }
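The keyword and name parameters in that listing URL are percent-encoded Chinese. Decoding them (a small aside, not part of the crawler) shows which city is being scraped:
from urllib.parse import unquote

# The listing URL above targets this city
print(unquote('%E6%A2%85%E5%B7%9E'))  # -> 梅州 (Meizhou)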
3.3.2 Getting each attraction's detail URL
- Use Selenium's locate-by-class-name method to read the href attribute of every attraction entry in the result list; that href is the attraction's detail URL.
- The page attribute keeps count of how many result pages have been visited.
# Get each attraction's detail URL
def get_url(self):
    while True:
        # Read the HTML of the result list and pull out every (href, title) pair
        content = self.chrome.find_element(By.CLASS_NAME, "right-content-list").get_attribute('innerHTML')
        cons = re.findall(r'href="(.*?)" title="(.*?)"', content)
        for con in cons:
            self.detail_url = 'https:' + con[0]
            self.title = con[1]
            print(self.detail_url, self.title)
            self.get_detail()
        # Click the "next page" arrow and move on
        self.chrome.find_element(By.CLASS_NAME, 'u_icon_enArrowforward').click()
        time.sleep(1)
        self.page += 1
        if self.page == 120:
            break
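The fixed time.sleep(1) after clicking the next-page arrow can be fragile on slow connections. A minimal alternative sketch using an explicit wait inside get_url, assuming the refreshed result list keeps the right-content-list class:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the result list to be present before reading it again
WebDriverWait(self.chrome, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'right-content-list'))
)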
3.3.3 Getting the detailed attraction information
- The detail fields are: attraction name, grade (1A-5A), address, opening hours (two versions on the page; we use the lower one), official phone number, introduction, and images.
- They are extracted with regular expressions and XPath; the full method is shown below.
- After each detail page is parsed, the record is written to the MySQL database.
def get_detail(self):
    # Fetch the detail page with requests, reusing the headers set in __init__
    detail_con = requests.get(self.detail_url, verify=False, headers=self.headers).text
    # time.sleep(2)
    '''Extract fields with regular expressions'''
    self.title = ''.join(re.findall(r'<div class="title"><h1>(.*?)<', detail_con, re.DOTALL))
    print('景點名稱:' + self.title)
    #self.rank = ''.join(re.findall(r'rankText">(.*?)<', detail_con, re.DOTALL))
    self.address = ''.join(re.findall(r'地址</p><p class="baseInfoText">(.*?)<', detail_con, re.DOTALL))
    self.mobile = ''.join(re.findall(r'官方電話</p><p class="baseInfoText">(.*?)<', detail_con, re.DOTALL))
    self.quality_grade = ''.join(re.findall(r'<div class="titleTips"><span>(.*?)<!--', detail_con, re.DOTALL))
    #self.openTime = ''.join(re.findall(r'開放時間</div><div class="moduleContent">(.*?)<', detail_con, re.DOTALL))
    # The first three characters of the address are the city name
    first_three_characters = self.address[:3]
    print('所在省份城市:' + '廣東省' + first_three_characters)
    print('詳細(xì)地址:' + self.address)
    #print('開放時間:' + self.openTime)
    print('電話:' + self.mobile)
    print('等級:' + self.quality_grade)
    # Attractions without a grade are stored as 0
    if self.quality_grade == '':
        self.quality_grade = 0
    '''Extract the description modules with XPath'''
    ret = etree.HTML(detail_con)
    desc_cons = ret.xpath('//div[@class="detailModule normalModule"]//div[@class="moduleContent"]')
    desc_titles = ret.xpath('//div[@class="detailModule normalModule"]//div[@class="moduleTitle"]')
    desc_list = []
    desc_title_list = []
    for d in desc_cons:
        des = ''.join(d.xpath('.//text()'))
        desc_list.append(des)
    for d in desc_titles:
        des = ''.join(d.xpath('.//text()'))
        desc_title_list.append(des)
    desc_dict = dict(zip(desc_title_list, desc_list))
    #print(desc_dict)
    first_value = list(desc_dict.values())[:2]  # take the first two module texts
    if len(first_value) >= 1:
        introduction = first_value[0]
    else:
        introduction = ''
    if len(first_value) >= 2:
        opening_hours = first_value[1]
    else:
        opening_hours = ''
    print('介紹:' + introduction)
    print('開放時間:' + opening_hours)
    '''Collect image URLs'''
    img_list = []
    imgs = re.findall(r'background-image:url\((.*?)\)', detail_con, re.DOTALL)
    for img in imgs:
        '''Each image is matched in two sizes; we only want the large one, so keep the 521*391 version'''
        image = re.search(r'521_391', img)
        if image:
            img_list.append(img)
    print(",".join(img_list))
    # Save the record to MySQL
    conn = pymysql.connect(host='localhost', user='root', password='root',
                           database='travel_ams', charset='utf8mb4')
    cursor = conn.cursor()
    sql = "INSERT INTO ams_attraction (attraction_name, quality_grade, province_city, location, open_hour, phone, introduction, images, add_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
    values = (self.title, self.quality_grade, '廣東省' + first_three_characters, self.address, opening_hours, self.mobile, introduction, ",".join(img_list), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    cursor.execute(sql, values)
    conn.commit()
    conn.close()
    #self.get_ticket()
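get_detail opens and closes a new MySQL connection for every attraction. A minimal sketch of a possible refactoring that reuses a single connection (same local credentials as above; this is a suggestion, not the original code):
# In __init__, open the connection once:
self.conn = pymysql.connect(host='localhost', user='root', password='root',
                            database='travel_ams', charset='utf8mb4')

# In get_detail, reuse it for each insert:
with self.conn.cursor() as cursor:
    cursor.execute(sql, values)
self.conn.commit()

# When the crawl finishes (e.g. at the end of get_url):
self.conn.close()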
3.4 Database design
The ams_attraction table in the travel_ams database
| Field name | Type | Primary key | Unique | Not null | Comment |
| --- | --- | --- | --- | --- | --- |
| attraction_id | int | Yes | | | Attraction ID, auto-increment |
| attraction_name | varchar(20) | | | | Attraction name |
| resource_type_id | int | | | | Attraction resource type ID |
| quality_grade | int | | | | Attraction grade |
| province_city | varchar(20) | | | | Province and city of the attraction |
| location | varchar(1000) | | | | Detailed address |
| open_hour | varchar(1000) | | | | Opening hours |
| phone | varchar(1000) | | | | Phone number |
| introduction | varchar(10000) | | | | Attraction introduction |
| images | varchar(1000) | | | | Attraction image list |
| status | int | | | | Status (1 = visible, 0 = hidden) |
| add_time | datetime | | | | Time the record was added |
| update_time | datetime | | | | Time the record was last updated |
The CREATE TABLE statement is as follows:
CREATE TABLE ams_attraction (
    attraction_id INT PRIMARY KEY AUTO_INCREMENT COMMENT '景點id,自增',
    attraction_name VARCHAR(20) COMMENT '景點名稱',
    resource_type_id INT COMMENT '景點資源類型id',
    quality_grade INT COMMENT '景點等級',
    province_city VARCHAR(20) COMMENT '景點所在省份城市',
    location VARCHAR(1000) COMMENT '詳細(xì)位置',
    open_hour VARCHAR(1000) COMMENT '開放時間',
    phone VARCHAR(1000) COMMENT '電話',
    introduction VARCHAR(10000) COMMENT '景點介紹',
    images VARCHAR(1000) COMMENT '景點圖片列表',
    status INT COMMENT '狀態(tài)【1為顯示,0為不顯示】',
    add_time DATETIME COMMENT '添加時間',
    update_time DATETIME COMMENT '修改時間'
);
3.5 Full code
- Running the main block imports the attraction data for the listing URL given above
- The console output shows whether each record was imported successfully
import re
import time
import requests
import urllib3
from lxml import etree
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import pymysql
import datetime

# Suppress the certificate warnings triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class Jy_jd(object):
    def __init__(self):
        options = Options()
        options.add_argument('--headless')
        service = Service()
        # Pass the options so that --headless actually takes effect
        self.chrome = Chrome(service=service, options=options)
        self.chrome.get(
            'https://huodong.ctrip.com/things-to-do/list?pagetype=city&citytype=dt&keyword=%E6%A2%85%E5%B7%9E&id=523&name=%E6%A2%85%E5%B7%9E&pshowcode=Ticket2&kwdfrom=srch&bookingtransactionid=1711160613361_6064')
        time.sleep(3)
        self.page = 1
        # Headers reused by requests when fetching the detail pages
        self.headers = {
            'cookie': 'suid=lh/P1+4RKuhAYg684ErS+g==; suid=lh/P1+4RKuhAYg684ErS+g==',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        }

    # Get each attraction's detail URL
    def get_url(self):
        while True:
            # Read the HTML of the result list and pull out every (href, title) pair
            content = self.chrome.find_element(By.CLASS_NAME, "right-content-list").get_attribute('innerHTML')
            cons = re.findall(r'href="(.*?)" title="(.*?)"', content)
            for con in cons:
                self.detail_url = 'https:' + con[0]
                self.title = con[1]
                print(self.detail_url, self.title)
                self.get_detail()
            # Click the "next page" arrow and move on
            self.chrome.find_element(By.CLASS_NAME, 'u_icon_enArrowforward').click()
            time.sleep(1)
            self.page += 1
            if self.page == 120:
                break

    def get_detail(self):
        # Fetch the detail page with requests, reusing the headers set in __init__
        detail_con = requests.get(self.detail_url, verify=False, headers=self.headers).text
        # time.sleep(2)
        '''Extract fields with regular expressions'''
        self.title = ''.join(re.findall(r'<div class="title"><h1>(.*?)<', detail_con, re.DOTALL))
        print('景點名稱:' + self.title)
        #self.rank = ''.join(re.findall(r'rankText">(.*?)<', detail_con, re.DOTALL))
        self.address = ''.join(re.findall(r'地址</p><p class="baseInfoText">(.*?)<', detail_con, re.DOTALL))
        self.mobile = ''.join(re.findall(r'官方電話</p><p class="baseInfoText">(.*?)<', detail_con, re.DOTALL))
        self.quality_grade = ''.join(re.findall(r'<div class="titleTips"><span>(.*?)<!--', detail_con, re.DOTALL))
        #self.openTime = ''.join(re.findall(r'開放時間</div><div class="moduleContent">(.*?)<', detail_con, re.DOTALL))
        # The first three characters of the address are the city name
        first_three_characters = self.address[:3]
        print('所在省份城市:' + '廣東省' + first_three_characters)
        print('詳細(xì)地址:' + self.address)
        #print('開放時間:' + self.openTime)
        print('電話:' + self.mobile)
        print('等級:' + self.quality_grade)
        # Attractions without a grade are stored as 0
        if self.quality_grade == '':
            self.quality_grade = 0
        '''Extract the description modules with XPath'''
        ret = etree.HTML(detail_con)
        desc_cons = ret.xpath('//div[@class="detailModule normalModule"]//div[@class="moduleContent"]')
        desc_titles = ret.xpath('//div[@class="detailModule normalModule"]//div[@class="moduleTitle"]')
        desc_list = []
        desc_title_list = []
        for d in desc_cons:
            des = ''.join(d.xpath('.//text()'))
            desc_list.append(des)
        for d in desc_titles:
            des = ''.join(d.xpath('.//text()'))
            desc_title_list.append(des)
        desc_dict = dict(zip(desc_title_list, desc_list))
        #print(desc_dict)
        first_value = list(desc_dict.values())[:2]  # take the first two module texts
        if len(first_value) >= 1:
            introduction = first_value[0]
        else:
            introduction = ''
        if len(first_value) >= 2:
            opening_hours = first_value[1]
        else:
            opening_hours = ''
        print('介紹:' + introduction)
        print('開放時間:' + opening_hours)
        '''Collect image URLs'''
        img_list = []
        imgs = re.findall(r'background-image:url\((.*?)\)', detail_con, re.DOTALL)
        for img in imgs:
            '''Each image is matched in two sizes; we only want the large one, so keep the 521*391 version'''
            image = re.search(r'521_391', img)
            if image:
                img_list.append(img)
        print(",".join(img_list))
        # Save the record to MySQL
        conn = pymysql.connect(host='localhost', user='root', password='root',
                               database='travel_ams', charset='utf8mb4')
        cursor = conn.cursor()
        sql = "INSERT INTO ams_attraction (attraction_name, quality_grade, province_city, location, open_hour, phone, introduction, images, add_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
        values = (self.title, self.quality_grade, '廣東省' + first_three_characters, self.address, opening_hours, self.mobile, introduction, ",".join(img_list), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        cursor.execute(sql, values)
        conn.commit()
        conn.close()
        #self.get_ticket()

    # Get ticket types and prices
    def get_ticket(self):
        id = self.detail_url.split('/')[-1]
        print(id)
        ticket_url = f'https://piao.ctrip.com/ticket/dest/{id}?onlyContent=true&onlyShelf=true'
        ticket_res = requests.get(ticket_url, verify=False, headers=self.headers).text
        # time.sleep(1)
        ticket_ret = etree.HTML(ticket_res)
        ticket = ticket_ret.xpath('//table[@class="ticket-table"]//div[@class="ttd-fs-18"]/text()')
        price = ticket_ret.xpath(
            '//table[@class="ticket-table"]//td[@class="td-price"]//strong[@class="ttd-fs-24"]/text()')
        print(ticket)
        print(price)
        '''The lists may contain an unknown number of blank entries; strip them all so the ticket types line up with the prices'''
        while True:
            try:
                ticket.remove(' ')
            except ValueError:
                break
        while True:
            try:
                price.remove(' ')
            except ValueError:
                break
        '''
        Some detail pages still fail to line up even after the blanks are removed, because the page structure differs.
        When that happens, fall back to a second set of XPath rules; otherwise the data is silently mismatched
        (no error is raised, but ticket types and prices no longer correspond).
        '''
        if len(ticket) != len(price):
            ticket = ticket_ret.xpath(
                '//table[@class="ticket-table"]/tbody[@class="tkt-bg-gray"]//a[@class="ticket-title "]/text()')
            price = ticket_ret.xpath('//table[@class="ticket-table"]//strong[@class="ttd-fs-24"]/text()')
            while True:
                try:
                    ticket.remove(' ')
                except ValueError:
                    break
            while True:
                try:
                    price.remove(' ')
                except ValueError:
                    break
        print(ticket)
        print(price)
        ticket_dict = dict(zip(ticket, price))
        print(ticket_dict)


if __name__ == '__main__':
    jy_jd = Jy_jd()
    jy_jd.get_url()
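Once main has run, a short stand-alone check (assuming the same local MySQL credentials used by the crawler) confirms that rows were actually written:
import pymysql

# Count the imported attractions in the local travel_ams database
conn = pymysql.connect(host='localhost', user='root', password='root',
                       database='travel_ams', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) FROM ams_attraction")
    print('imported attractions:', cursor.fetchone()[0])
conn.close()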
3.6 Result screenshots
(Result screenshots appear in the original post.)