前言
數據采集的步驟是固定的:
- 發送請求, 模擬瀏覽器對于url地址發送請求
- 獲取數據, 獲取網頁數據內容 --> 請求那個鏈接地址, 返回服務器響應數據
- 解析數據, 提取我們需要的數據內容
- 保存數據, 保存本地文件
所需模塊
win + R 輸入cmd 輸入安裝命令 pip install 模塊名 (如果你覺得安裝速度比較慢, 你可以切換國內鏡像源)
# 數據請求模塊 第三方模塊 需要安裝 pip install requests
import requests
# 數據解析模塊 第三方模塊 需要安裝 pip install parsel
import parsel
# 導入csv模塊 內置模塊 不需要安裝
import csv # 固定模板
# 導入pandas模塊
import pandas as pd
二手房源數據獲取
請求數據
# Request headers that make the request look like it comes from a browser.
headers = {
    # User-Agent: basic identity information of the (simulated) browser.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
# Address of the listing page to request.
url = 'https://cs.lianjia.com/ershoufang'
# Send the GET request. A timeout is passed explicitly because requests
# applies none by default, so an unresponsive server would otherwise hang
# the script forever.
response = requests.get(url=url, headers=headers, timeout=10)
# Print the response object; <Response [200]> means the request succeeded.
print(response)
解析數據
我們這次選用css選擇器: 根據標簽屬性提取數據內容
- 獲取所有房源所在li標簽
# Wrap the downloaded HTML in a parsel selector so it can be queried with CSS.
html_text = response.text
selector = parsel.Selector(text=html_text)
# Select the ".info" node under every <li> of the ".sellListContent" list,
# i.e. the nodes that hold the individual listings.
lis = selector.css('.sellListContent li .info')
- for循環遍歷
# Walk over every listing node, extract its fields and print them as a dict.
for li in lis:
    title = li.css('.title a::text').get()  # listing title
    area_info = li.css('.positionInfo a::text').getall()  # location links
    area_1 = area_info[0]  # residential community
    area_2 = area_info[1]  # district
    totalPrice = li.css('.totalPrice span::text').get()  # total price
    unitPrice = li.css('.unitPrice span::text').get().replace('元/平', '')  # unit price
    # The details line is a single ' | '-separated string; split it into fields.
    houseInfo = li.css('.houseInfo::text').get().split(' | ')
    HouseType = houseInfo[0]  # layout
    HouseArea = houseInfo[1].replace('平米', '')  # floor area
    HouseFace = houseInfo[2]  # orientation
    HouseInfo_1 = houseInfo[3]  # decoration
    fool = houseInfo[4]  # floor
    # Year built. The dict below needs `date`, which was never assigned.
    # It sits between the floor field and the trailing structure field, so it
    # only exists when there are at least 7 fields; otherwise index 5 is
    # already the structure field and the year is recorded as empty.
    # NOTE(review): assumes the year is field 6 of 7 — confirm on a live page.
    date = houseInfo[5] if len(houseInfo) > 6 else ''
    HouseInfo_2 = houseInfo[-1]  # building structure (always the last field)
    href = li.css('.title a::attr(href)').get()  # detail page URL
    dit = {
        '標(biāo)題': title,
        '小區(qū)': area_1,
        '區(qū)域': area_2,
        '總價': totalPrice,
        '單價': unitPrice,
        '戶型': HouseType,
        '面積': HouseArea,
        '朝向': HouseFace,
        '裝修': HouseInfo_1,
        '樓層': fool,
        '年份': date,
        '建筑結(jié)構(gòu)': HouseInfo_2,
        '詳情頁': href,
    }
    print(dit)
保存數據
# Column names of the output CSV, in the order they are written.
columns = [
    '標(biāo)題',
    '小區(qū)',
    '區(qū)域',
    '總價',
    '單價',
    '戶型',
    '面積',
    '朝向',
    '裝修',
    '樓層',
    '年份',
    '建筑結(jié)構(gòu)',
    '詳情頁',
]
# Open the output file for writing; newline='' lets the csv module control
# line endings. The handle stays open so rows can be written through
# `csv_writer` afterwards; nothing in this block closes it.
f = open('二手房.csv', mode='w', encoding='utf-8', newline='')
csv_writer = csv.DictWriter(f, fieldnames=columns)
# Emit the header row.
csv_writer.writeheader()
接下來就是數據可視化
二手房源戶型分布
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.faker import Faker

# Load the listings to chart. `df` is used by every chart below, but the
# visible source never defined it.
# NOTE(review): presumably the CSV written by the scraping step — confirm the path.
df = pd.read_csv('二手房.csv')

# Count listings per layout; `house_type` and `house_num` were used below
# without ever being assigned.
house_counts = df['戶型'].value_counts()
house_type = house_counts.index.to_list()
house_num = house_counts.to_list()

# Pie chart of the layout distribution with a scrollable vertical legend.
c = (
    Pie()
    .add(
        "",
        [list(z) for z in zip(house_type, house_num)],  # [name, count] pairs
        center=["40%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="二手房源戶型分布"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
    )
    # "{b}" is the slice name and "{c}" its value; the name placeholder was
    # missing, leaving labels that showed only ": <count>".
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
# NOTE(review): unlike the other charts in this file, this one only calls
# load_javascript() and never render_notebook(); confirm whether the render
# call belongs in a following notebook cell.
c.load_javascript()
二手房源朝向分布
# Count listings per orientation once, then split into names and counts
# (value_counts() was previously computed twice).
face_counts = df['朝向'].value_counts()
face_type = face_counts.index.to_list()
face_num = face_counts.to_list()

# Pie chart of the orientation distribution with a scrollable vertical legend.
c = (
    Pie()
    .add(
        "",
        [list(z) for z in zip(face_type, face_num)],  # [name, count] pairs
        center=["40%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="二手房源朝向分布"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
    )
    # "{b}" is the slice name and "{c}" its value; the name placeholder was
    # missing, leaving labels that showed only ": <count>".
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
# Display the chart inline in the notebook.
c.render_notebook()
二手房源裝修分布
# Count listings per decoration level once, then split into names and counts
# (value_counts() was previously computed twice).
decoration_counts = df['裝修'].value_counts()
face_type = decoration_counts.index.to_list()
face_num = decoration_counts.to_list()

# Pie chart of the decoration distribution with a scrollable vertical legend.
c = (
    Pie()
    .add(
        "",
        [list(z) for z in zip(face_type, face_num)],  # [name, count] pairs
        center=["40%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="二手房源裝修分布"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
    )
    # "{b}" is the slice name and "{c}" its value; the name placeholder was
    # missing, leaving labels that showed only ": <count>".
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
# Display the chart inline in the notebook.
c.render_notebook()
二手房源年份分布
# Count listings per construction year once, then split into names and counts
# (value_counts() was previously computed twice).
year_counts = df['年份'].value_counts()
face_type = year_counts.index.to_list()
face_num = year_counts.to_list()

# Pie chart of the year distribution with a scrollable vertical legend.
c = (
    Pie()
    .add(
        "",
        [list(z) for z in zip(face_type, face_num)],  # [name, count] pairs
        center=["40%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="二手房源年份分布"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
    )
    # "{b}" is the slice name and "{c}" its value; the name placeholder was
    # missing, leaving labels that showed only ": <count>".
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
# Display the chart inline in the notebook.
c.render_notebook()
二手房源建筑結構分布
# Count listings per building structure once, then split into names and counts
# (value_counts() was previously computed twice).
structure_counts = df['建筑結(jié)構(gòu)'].value_counts()
face_type = structure_counts.index.to_list()
face_num = structure_counts.to_list()

# Pie chart of the building-structure distribution with a scrollable vertical legend.
c = (
    Pie()
    .add(
        "",
        [list(z) for z in zip(face_type, face_num)],  # [name, count] pairs
        center=["40%", "50%"],
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="二手房源建筑結(jié)構(gòu)分布"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
    )
    # "{b}" is the slice name and "{c}" its value; the name placeholder was
    # missing, leaving labels that showed only ": <count>".
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
# Display the chart inline in the notebook.
c.render_notebook()
各大區域房價平均價
from pyecharts.charts import Bar

# Mean total price per district.
avg_salary = df.groupby('區(qū)域')['總價'].mean()
CityType = avg_salary.index.tolist()  # district names (x axis)
CityNum = [int(a) for a in avg_salary.values.tolist()]  # means truncated to int (y axis)

# Bar chart of the per-district averages.
c = (
    Bar()
    .add_xaxis(CityType)
    .add_yaxis("", CityNum)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="各大區(qū)域房價平均價"),
        visualmap_opts=opts.VisualMapOpts(
            dimension=1,
            pos_right="5%",
            # Scale the colour range to the data instead of a fixed ceiling of
            # 30, above which bars cannot be told apart by colour. The
            # original 30 is kept as the fallback for an empty data set.
            max_=max(CityNum, default=30),
            is_inverse=True,
        ),
        # Rotate the x-axis labels by 45 degrees.
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=45)),
    )
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        # Mark lines for the minimum, maximum and average bar values.
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_="min", name="最小值"),
                opts.MarkLineItem(type_="max", name="最大值"),
                opts.MarkLineItem(type_="average", name="平均值"),
            ]
        ),
    )
)
# Display the chart inline in the notebook.
c.render_notebook()
各大區域房價單價平均價格
import pandas as pd
from pyecharts.charts import Bar
import pyecharts.options as opts

# Strip the thousands separators from the unit price and convert it to int.
# Casting to str first keeps this step re-runnable: once the column is
# already int, the .str accessor alone would raise.
df['單價'] = df['單價'].astype(str).str.replace(',', '', regex=False).astype(int)
# Mean unit price per district.
avg_salary = df.groupby('區(qū)域')['單價'].mean()
CityType = avg_salary.index.tolist()  # district names (x axis)
CityNum = [int(a) for a in avg_salary.values.tolist()]  # means truncated to int (y axis)

# Bar chart of the per-district averages.
c = (
    Bar()
    .add_xaxis(CityType)
    .add_yaxis("", CityNum)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="各大區(qū)域房價單價平均價格"),
        visualmap_opts=opts.VisualMapOpts(
            dimension=1,
            pos_right="5%",
            # Scale the colour range to the data instead of a fixed ceiling of
            # 30, above which bars cannot be told apart by colour. The
            # original 30 is kept as the fallback for an empty data set.
            max_=max(CityNum, default=30),
            is_inverse=True,
        ),
        # Rotate the x-axis labels by 45 degrees.
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=45)),
    )
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=False),
        # Mark lines for the minimum, maximum and average bar values.
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_="min", name="最小值"),
                opts.MarkLineItem(type_="max", name="最大值"),
                opts.MarkLineItem(type_="average", name="平均值"),
            ]
        ),
    )
)
# Display the bar chart inline in the notebook.
c.render_notebook()
【全網最全400個python實戰項目】2023最新版 暑期禁止擺爛!練完開啟Python兼職之旅~文章來源:http://www.zghlxwxcb.cn/news/detail-719034.html
400個實戰案例已經為大家準備好 確定不看看?
評論或者私信即可獲取~文章來源地址http://www.zghlxwxcb.cn/news/detail-719034.html
到了這里,關于Python教你一招,爬取鏈家二手房并做數據可視化分析的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持TOY模板網!