一、數(shù)據(jù)集介紹
user_data.csv是一份用戶行為數(shù)據(jù),時(shí)間區(qū)間為2017-11-25到2017-12-03,總計(jì)29132493條記錄,大小為1.0G,包含5個(gè)字段。數(shù)據(jù)集的每一行表示一條用戶行為,由用戶ID、商品ID、商品類目ID、行為類型和時(shí)間戳組成,并以逗號分隔。關(guān)于數(shù)據(jù)集中每一列的詳細(xì)描述如下:
用戶行為類型共有四種,它們分別是:
二、數(shù)據(jù)處理
1. 數(shù)據(jù)導(dǎo)入
將數(shù)據(jù)加載到hive,然后通過hive對數(shù)據(jù)進(jìn)行處理。
(1)上傳new_data.csv文件至虛擬機(jī)
(2)創(chuàng)建user_db數(shù)據(jù)庫
create database user_db;
(3)創(chuàng)建user_data表
create table user_data(
user_id string,
item_id string,
category_id string,
behavior_type string,
create_time int)
row format delimited fields terminated by ','
lines terminated by '\n';
(4)將new_data.csv文件中的數(shù)據(jù)導(dǎo)入到user_data表中
load data local inpath '/root/hive/user_data.csv' into table user_data;
2. 數(shù)據(jù)清洗
數(shù)據(jù)處理主要包括:刪除重復(fù)值,時(shí)間戳格式化,刪除異常值。
- 創(chuàng)建user_data_new表,為其添加時(shí)間字符串字段
- 數(shù)據(jù)清洗,去掉完全重復(fù)的數(shù)據(jù)
- 數(shù)據(jù)清洗,時(shí)間戳格式化成datetime。要用到from_unixtime函數(shù)。
- 查看時(shí)間是否有異常值
- 數(shù)據(jù)清洗,去掉時(shí)間異常的數(shù)據(jù)
- 查看 behavior_type 是否有異常值
(1)查看數(shù)據(jù)量
select count(1) from user_data;
(2)數(shù)據(jù)去重
insert overwrite table user_data
select user_id,item_id,category_id,behavior_type,create_time
from user_data
group by user_id,item_id,category_id,behavior_type,create_time;
可以看到有11條重復(fù)數(shù)據(jù),已經(jīng)去除。
(3)創(chuàng)建user_data_new表,為其添加時(shí)間字符串字段
create table user_data_new(
user_id string,
item_id string,
category_id string,
behavior_type string,
datetime string
)row format delimited fields terminated by ','
lines terminated by '\n';
(4)時(shí)間格式轉(zhuǎn)換
insert overwrite table user_data_new
select user_id,item_id,category_id,behavior_type,from_unixtime(create_time,'yyyy-MM-dd HH:mm:ss')
from user_data;
(5)查看時(shí)間異常值
select date(datetime) as day from user_data_new group by date(datetime) order by day;
(6)去除時(shí)間異常值
insert overwrite table user_data_new
select user_id,item_id,category_id,behavior_type,datetime
from user_data_new
where cast(datetime as date) between '2017-11-25' and '2017-12-03';
(9)查看behavior_type是否有異常值
select behavior_type from user_data_new group by behavior_type;
三、數(shù)據(jù)分析可視化
1. 用戶流量及購物情況
(1)總訪問量PV,總用戶量UV
select sum(case when behavior_type='pv' then 1 else 0 end) as pv,
count(distinct user_id) as uv
from user_data_new;
(2)日均訪問量,日均用戶量
① 統(tǒng)計(jì)日均訪問量,日均用戶量,并加工到day_pv_uv表中
create table day_pv_uv as
select cast(datetime as date) as day,
sum(case when behavior_type='pv' then 1 else 0 end) as pv,
count(distinct user_id) as uv
from user_data_new
group by cast(datetime as date)
order by day;
② 將得到的數(shù)據(jù)通過sqoop遷移至mysql
- 在mysql中創(chuàng)建數(shù)據(jù)庫和表
create table day_pv_uv (day date,pv int(20),uv int(20));
- sqoop數(shù)據(jù)遷移
bin/sqoop export \
--connect jdbc:mysql://hadoop01:3306/user_db \
--username root \
--password Guo_2001 \
--table day_pv_uv \
--fields-terminated-by '\001' \
--export-dir '/user/hive/warehouse/user_db.db/day_pv_uv' \
--num-mappers 1
- 查看遷移后的數(shù)據(jù)
③ 利用python讀取mysql數(shù)據(jù)并可視化 - pymysql讀取數(shù)據(jù)
import pymysql
# 讀取mysql數(shù)據(jù)
daylist = []
pvlist = []
uvlist = []
conn = pymysql.connect(host='192.168.20.128',
port=3306,
user='root',
password='Guo_2001',
db='user_db',
charset='utf8')
cursor = conn.cursor()
try:
sql_name = """ SELECT day FROM day_pv_uv """
cursor.execute(sql_name)
days = cursor.fetchall()
for i in range(0,len(days)):
daylist.append(days[i][0])
# print(daylist)
sql_num = """ SELECT pv FROM day_pv_uv """
cursor.execute(sql_num)
pvs = cursor.fetchall()
for i in range(0,len(pvs)):
pvlist.append(pvs[i][0])
# print(pvlist)
sql_num = """ SELECT uv FROM day_pv_uv """
cursor.execute(sql_num)
uvs = cursor.fetchall()
for i in range(0,len(uvs)):
uvlist.append(uvs[i][0])
# print(uvlist)
except:
print("未查詢到數(shù)據(jù)!")
conn.rollback()
finally:
conn.close()
- pyecharts可視化
import pyecharts.options as opts
from pyecharts.charts import Bar, Line
bar = (
Bar(init_opts=opts.InitOpts(width="1100px", height="600px"))
.set_global_opts(title_opts=opts.TitleOpts(title="每日訪問情況"))
.add_xaxis(xaxis_data=daylist)
.add_yaxis(
series_name="pv",
y_axis=pvlist,
label_opts=opts.LabelOpts(is_show=False),
)
.add_yaxis(
series_name="uv",
y_axis=uvlist,
label_opts=opts.LabelOpts(is_show=False),
)
.set_global_opts(
tooltip_opts=opts.TooltipOpts(
is_show=True, trigger="axis", axis_pointer_type="cross"
),
xaxis_opts=opts.AxisOpts(
name='date',
name_location='middle',
name_gap=30,
name_textstyle_opts=opts.TextStyleOpts(
font_family='Times New Roman',
font_size=16, # 標(biāo)簽字體大小
)),
yaxis_opts=opts.AxisOpts(
type_="value",
axislabel_opts=opts.LabelOpts(formatter="{value}"),
axistick_opts=opts.AxisTickOpts(is_show=True),
splitline_opts=opts.SplitLineOpts(is_show=True),
)
)
)
bar.render("折線圖-柱狀圖多維展示.html")
(3)每個(gè)用戶的購物情況,加工到 user_behavior_count表中
create table user_behavior_count as
select user_id,
sum(case when behavior_type='pv' then 1 else 0 end) as pv,
sum(case when behavior_type='fav' then 1 else 0 end) as fav,
sum(case when behavior_type='cart' then 1 else 0 end) as cart,
sum(case when behavior_type='buy' then 1 else 0 end) as buy
from user_data_new
group by user_id;
(4)統(tǒng)計(jì)復(fù)購率
復(fù)購率:產(chǎn)生兩次或兩次以上購買的用戶占購買用戶的比例
select sum(case when buy>1 then 1 else 0 end)/sum(case when buy>0 then 1 else 0 end)
from user_behavior_count;
可以看到復(fù)購率為0.65,還是不錯(cuò)的。
2. 用戶行為轉(zhuǎn)化率
(1)統(tǒng)計(jì)各環(huán)節(jié)轉(zhuǎn)化率
點(diǎn)擊/(加購物車+收藏)/購買,各環(huán)節(jié)轉(zhuǎn)化率
select a.pv,
a.fav,
a.cart,
a.fav + a.cart as `fav+cart`,
a.buy,
round((a.fav + a.cart) / a.pv, 4) as pv2favcart,
round(a.buy / (a.fav + a.cart), 4) as favcart2buy,
round(a.buy / a.pv, 4) as pv2buy
from(
select sum(pv) as pv,
sum(fav) as fav,
sum(cart) as cart,
sum(buy) as buy
from user_behavior_count
) as a;
(2)用戶行為轉(zhuǎn)化漏斗可視化
從漏斗圖中可以看到,收藏和加購物車的用戶行為是最多的,而購買最少,也符合實(shí)際。
3. 用戶行為習(xí)慣
(1)一天的活躍時(shí)段分布
① 統(tǒng)計(jì)每天24小時(shí)內(nèi)的行為數(shù)據(jù),并加工到hour_behavior表中
create table hour_behavior as
select hour(datetime) as hour,
sum(case when behavior_type = 'pv' then 1 else 0 end) as pv,
sum(case when behavior_type = 'fav' then 1 else 0 end) as fav,
sum(case when behavior_type = 'cart' then 1 else 0 end) as cart,
sum(case when behavior_type = 'buy' then 1 else 0 end) as buy
from user_data_new
group by hour(datetime)
order by hour;
② 將得到的數(shù)據(jù)通過sqoop遷移至mysql
- 在mysql中創(chuàng)建表
create table hour_behavior (
hour int(20),
pv int(20),
fav int(20),
cart int(20),
buy int(20)
);
- sqoop數(shù)據(jù)遷移
bin/sqoop export \
--connect jdbc:mysql://hadoop01:3306/user_db \
--username root \
--password Guo_2001 \
--table hour_behavior \
--fields-terminated-by '\001' \
--export-dir '/user/hive/warehouse/user_db.db/hour_behavior' \
--num-mappers 1
- 查看遷移后的數(shù)據(jù)
③ 利用python讀取mysql數(shù)據(jù)并可視化 - pymysql讀取數(shù)據(jù)
import pymysql
# 讀取mysql數(shù)據(jù)
hourlist = []
pvlist = []
favlist = []
cartlist = []
buylist = []
conn = pymysql.connect(host='192.168.20.128',
port=3306,
user='root',
password='Guo_2001',
db='user_db',
charset='utf8')
cursor = conn.cursor()
try:
sql_name = """ SELECT hour FROM hour_behavior """
cursor.execute(sql_name)
hours = cursor.fetchall()
for i in range(0,len(hours)):
hourlist.append(hours[i][0])
sql_num = """ SELECT pv FROM hour_behavior """
cursor.execute(sql_num)
pvs = cursor.fetchall()
for i in range(0,len(pvs)):
pvlist.append(pvs[i][0])
sql_num = """ SELECT fav FROM hour_behavior """
cursor.execute(sql_num)
favs = cursor.fetchall()
for i in range(0,len(favs)):
favlist.append(favs[i][0])
sql_num = """ SELECT cart FROM hour_behavior """
cursor.execute(sql_num)
carts = cursor.fetchall()
for i in range(0,len(carts)):
cartlist.append(carts[i][0])
sql_num = """ SELECT buy FROM hour_behavior """
cursor.execute(sql_num)
buys = cursor.fetchall()
for i in range(0,len(buys)):
buylist.append(buys[i][0])
except:
print("未查詢到數(shù)據(jù)!")
conn.rollback()
finally:
conn.close()
- pyecharts可視化
from pyecharts.charts import Line
# 堆疊柱狀圖繪制
line=Line()
line.add_xaxis(hourlist)
line.add_yaxis('點(diǎn)贊數(shù)',pvlist,stack="stack1",label_opts=opts.LabelOpts(is_show=False))
line.add_yaxis('收藏?cái)?shù)',favlist,stack="stack1",label_opts=opts.LabelOpts(is_show=False))
line.add_yaxis('加購物車數(shù)',cartlist,stack="stack1",label_opts=opts.LabelOpts(is_show=False))
line.add_yaxis('購買數(shù)',buylist,stack="stack1",label_opts=opts.LabelOpts(is_show=False))
line.set_global_opts(title_opts=opts.TitleOpts(title="用戶一天24小時(shí)的活躍時(shí)段分布"))
line.render_notebook()
從圖中可以看到一天24小時(shí)中,13和14時(shí)用戶處于最活躍的狀態(tài),而19-21時(shí)用戶的活躍次數(shù)并不高,當(dāng)然此時(shí)也處于睡覺時(shí)間,符合實(shí)際情況。
(2)一周用戶的活躍分布
① 統(tǒng)計(jì)一周七天內(nèi)的行為數(shù)據(jù),并加工到week_behavior表中
create table week_behavior as
select pmod(datediff(datetime, '1920-01-01') - 3, 7) as weekday,
sum(case when behavior_type = 'pv' then 1 else 0 end) as pv,
sum(case when behavior_type = 'fav' then 1 else 0 end) as fav,
sum(case when behavior_type = 'cart' then 1 else 0 end) as cart,
sum(case when behavior_type = 'buy' then 1 else 0 end) as buy
from user_data_new
where date(datetime) between '2017-11-27' and '2017-12-03'
group by pmod(datediff(datetime, '1920-01-01') - 3, 7)
order by weekday;
② 將得到的數(shù)據(jù)通過sqoop遷移至mysql
- 在mysql中創(chuàng)建表
create table week_behavior (
weekday int(20),
pv int(20),
fav int(20),
cart int(20),
buy int(20)
);
- sqoop數(shù)據(jù)遷移
bin/sqoop export \
--connect jdbc:mysql://hadoop01:3306/user_db \
--username root \
--password Guo_2001 \
--table week_behavior \
--fields-terminated-by '\001' \
--export-dir '/user/hive/warehouse/user_db.db/week_behavior' \
--num-mappers 1
文章來源:http://www.zghlxwxcb.cn/news/detail-470325.html
- 查看遷移后的數(shù)據(jù)
③ 利用python讀取mysql數(shù)據(jù)并可視化 - pymysql讀取數(shù)據(jù)
import pymysql
# 讀取mysql數(shù)據(jù)
weeklist = []
pvlist = []
favlist = []
cartlist = []
buylist = []
conn = pymysql.connect(host='192.168.20.128',
port=3306,
user='root',
password='Guo_2001',
db='user_db',
charset='utf8')
cursor = conn.cursor()
try:
sql_name = """ SELECT weekday FROM week_behavior """
cursor.execute(sql_name)
weeks = cursor.fetchall()
for i in range(0,len(weeks)):
weeklist.append(weeks[i][0])
sql_num = """ SELECT pv FROM week_behavior """
cursor.execute(sql_num)
pvs = cursor.fetchall()
for i in range(0,len(pvs)):
pvlist.append(pvs[i][0])
sql_num = """ SELECT fav FROM week_behavior """
cursor.execute(sql_num)
favs = cursor.fetchall()
for i in range(0,len(favs)):
favlist.append(favs[i][0])
sql_num = """ SELECT cart FROM week_behavior """
cursor.execute(sql_num)
carts = cursor.fetchall()
for i in range(0,len(carts)):
cartlist.append(carts[i][0])
sql_num = """ SELECT buy FROM week_behavior """
cursor.execute(sql_num)
buys = cursor.fetchall()
for i in range(0,len(buys)):
buylist.append(buys[i][0])
except:
print("未查詢到數(shù)據(jù)!")
conn.rollback()
finally:
conn.close()
- pyecharts可視化
from pyecharts.charts import Line
# 堆疊這些線圖繪制
line=Line()
line.add_xaxis(weeklist)
line.add_yaxis('點(diǎn)贊數(shù)',pvlist,stack="stack1",label_opts=opts.LabelOpts(is_show=False))
line.add_yaxis('收藏?cái)?shù)',favlist,stack="stack1",label_opts=opts.LabelOpts(is_show=False))
line.add_yaxis('加購物車數(shù)',cartlist,stack="stack1",label_opts=opts.LabelOpts(is_show=False))
line.add_yaxis('購買數(shù)',buylist,stack="stack1",label_opts=opts.LabelOpts(is_show=False))
line.set_global_opts(title_opts=opts.TitleOpts(title="一周用戶的活躍分布"))
line.render_notebook()
從圖中可以看到,在一周中,周日是用戶最活躍的一天,休息日不管是從點(diǎn)贊量、收藏量、加購物車量還是購買量來看都是處于最高的位置。文章來源地址http://www.zghlxwxcb.cn/news/detail-470325.html
到了這里,關(guān)于【Hive+MySQL+Python】淘寶用戶購物行為數(shù)據(jù)分析項(xiàng)目的文章就介紹完了。如果您還想了解更多內(nèi)容,請?jiān)谟疑辖撬阉鱐OY模板網(wǎng)以前的文章或繼續(xù)瀏覽下面的相關(guān)文章,希望大家以后多多支持TOY模板網(wǎng)!