1.數據加載
import pandas as pd
# Read the supply-chain CSV into a DataFrame, decoding it with the
# 'unicode_escape' codec (presumably because the file is not valid UTF-8 —
# confirm against the actual file).
dataset = pd.read_csv(
    'SupplyChain.csv',
    encoding='unicode_escape',
)
# Notebook-style bare expression: it renders the frame when run as a notebook
# cell and has no effect when the file is run as a script.
dataset
2.查看數據情況
# Print the frame's (rows, columns) shape, then the number of missing values
# in each column, one after the other.
print(dataset.shape, dataset.isnull().sum(), sep='\n')
3.數據合并及填充
# Show the two raw name columns before they are combined.
print(dataset.loc[:, ['Customer Fname', 'Customer Lname']])

# Concatenate first name and last name (no separator is inserted) into a
# single full-name column.
dataset['Customer Full Name'] = (
    dataset['Customer Fname'] + dataset['Customer Lname']
)

# Work on the zipcode column: frequency table (only visible in a notebook),
# then the count of missing entries (the original author observed 3), and
# finally replace the missing entries with the placeholder 0.
_zipcode = dataset['Customer Zipcode']
_zipcode.value_counts()
print(_zipcode.isnull().sum())
dataset['Customer Zipcode'] = _zipcode.fillna(0)

# Notebook-style preview of the first rows.
dataset.head()
4.查看特征字段之間相關性
import matplotlib.pyplot as plt
import seaborn as sns
# Heat map of the pairwise correlations between the feature columns.
# `data` is bound to the same object as `dataset` (no copy is made), so
# columns added through `data` later on are visible through `dataset` too.
data = dataset
plt.figure(figsize=(20, 10))
# Correlate only the numeric/bool columns: the frame also holds text columns
# (e.g. the customer names), and since pandas 2.0 DataFrame.corr() raises on
# them instead of silently dropping them.
# annot=True writes the coefficient inside every cell.
sns.heatmap(
    data.select_dtypes(include=['number', 'bool']).corr(),
    annot=True,
    cmap='coolwarm',
)
# Display (and thereby finish) this figure here, as the other chart sections
# do, so later plotting code does not draw onto it.
plt.show()
# Original author's finding: Product Price is highly correlated with Sales
# and with Order Item Total.
5.聚合操作
# Group the orders by market and by order region.
market = data.groupby('Market')
region = data.groupby('Order Region')

# Total 'Sales per customer' per group, largest first, one bar chart each.
# Every chart gets a newly created figure rather than re-activating figure
# numbers 1 and 2: Series.plot draws on the current axes, so plt.figure(1)
# would paint the bars onto any figure 1 that is still open (for example an
# earlier chart that was never shown or closed).
plt.figure(figsize=(12, 6))
market['Sales per customer'].sum().sort_values(ascending=False).plot.bar(
    title='Sales in different markets'
)
plt.figure(figsize=(12, 6))
region['Sales per customer'].sum().sort_values(ascending=False).plot.bar(
    title='Sales in different regions'
)
plt.show()
# Group the orders by product category.
cat = data.groupby('Category Name')

# Each chart gets a newly created figure rather than re-activating figure
# numbers 1 and 2: Series.plot draws on the current axes, so plt.figure(1)
# would paint the bars onto any figure 1 that is still open.

# Total 'Sales per customer' per category, largest first.
plt.figure(figsize=(12, 6))
cat['Sales per customer'].sum().sort_values(ascending=False).plot.bar(
    title='Total sales'
)

# Average 'Sales per customer' per category, largest first. The title names
# the mean explicitly so the two charts can be told apart.
plt.figure(figsize=(12, 6))
cat['Sales per customer'].mean().sort_values(ascending=False).plot.bar(
    title='Average sales'
)
plt.show()
6.時間維度上看銷售額
# Parse the order-date column into a DatetimeIndex so its calendar
# components can be read off directly.
temp = pd.DatetimeIndex(data['order date (DateOrders)'])
# Notebook-style bare expression (displays the index in a notebook).
temp

# Derived calendar columns, added in this order: year, month, weekday,
# hour of day, and the year-month period.
_order_parts = {
    'order_year': temp.year,
    'order_month': temp.month,
    'order_week_day': temp.weekday,
    'order_hour': temp.hour,
    'order_month_year': temp.to_period('M'),
}
for _column, _values in _order_parts.items():
    data[_column] = _values

# Notebook-style preview of the first rows with the new columns.
data.head()
# Explore average 'Sales' along four time dimensions: year, day of week,
# hour of day and month.
# NOTE(review): the figure is created with figsize=(10, 12) while every plot
# call passes figsize=(12, 12) — confirm which size is intended.
plt.figure(figsize=(10, 12))

# Build the four groupers up front; each is plotted in its own subplot below.
df_year = data.groupby('order_year')
df_day = data.groupby('order_week_day')
df_hour = data.groupby('order_hour')
df_month = data.groupby('order_month')

# Slots 1-4 of a 4-row by 2-column grid are used.
plt.subplot(4, 2, 1)
df_year['Sales'].mean().plot(
    figsize=(12, 12), title='Average sales in years'
)

plt.subplot(4, 2, 2)
df_day['Sales'].mean().plot(
    figsize=(12, 12), title='Average sales in days per week'
)

plt.subplot(4, 2, 3)
df_hour['Sales'].mean().plot(
    figsize=(12, 12), title='Average sales in hours per day'
)

plt.subplot(4, 2, 4)
df_month['Sales'].mean().plot(
    figsize=(12, 12), title='Average sales in month per year'
)

plt.tight_layout()
plt.show()
# Plot 'Sales per customer' against 'Product Price' to look at how the two
# relate (DataFrame.plot with its default plot kind).
data.plot(x='Product Price', y='Sales per customer')
# Label the axes and the chart, then display it.
plt.ylabel('Sales per customer')
plt.xlabel('Product Price')
plt.title('Relationship between Product Price and Sales per customer')
plt.show()
7.計算用戶RFM
# --- RFM customer segmentation: build the per-customer R / F / M values ---

# Monetary amount per row: item quantity multiplied by the order-item total.
# NOTE(review): if 'Order Item Total' is already a line total, this counts
# the quantity twice — confirm against the data dictionary.
data['TotalPrice'] = data['Order Item Quantity'] * data['Order Item Total']
data[['TotalPrice', 'Order Item Quantity', 'Order Item Total']]

# Store the order date as real datetimes so max() and subtraction work.
data['order date (DateOrders)'] = pd.to_datetime(
    data['order date (DateOrders)']
)
# Timestamp of the latest order (only displayed in a notebook).
data['order date (DateOrders)'].max()

# Reference "today" for the recency calculation: assumed to be 2018-02-01.
import datetime
present = datetime.datetime(2018, 2, 1)

# One row per 'Order Customer Id':
#   recency   = whole days between `present` and the customer's latest order
#   frequency = number of rows for the customer
#   monetary  = sum of TotalPrice
# NOTE(review): frequency counts rows, not distinct 'Order Id' values —
# confirm that is the intended definition.
customer_seg = data.groupby('Order Customer Id').agg({
    'order date (DateOrders)': lambda dates: (present - dates.max()).days,
    'Order Id': lambda ids: len(ids),
    'TotalPrice': lambda amounts: amounts.sum(),
})
customer_seg

# Rename the aggregated columns to R_Value / F_Value / M_Value.
customer_seg = customer_seg.rename(columns={
    'order date (DateOrders)': 'R_Value',
    'Order Id': 'F_Value',
    'TotalPrice': 'M_Value',
})
customer_seg.head()

# Quartile cut-offs (25 %, 50 %, 75 %) for every column, as a nested dict
# of the form {column: {0.25: ..., 0.5: ..., 0.75: ...}}.
quantiles = customer_seg.quantile(q=[0.25, 0.5, 0.75]).to_dict()
quantiles
# Recency: a smaller R_Value is better, so it earns a larger R_Score.
def R_Score(a, b, c):
    """Map a value onto a 1-4 score where smaller values score higher.

    Args:
        a: The raw value to score.
        b: Key of the metric inside ``c`` (callers pass ``"R_Value"``).
        c: Nested mapping ``{metric: {0.25: q1, 0.5: q2, 0.75: q3}}`` holding
            the quartile cut-offs.

    Returns:
        4 if ``a`` is at or below the 25 % cut-off, 3 if at or below the
        50 % cut-off, 2 if at or below the 75 % cut-off, otherwise 1.
    """
    cutoffs = c[b]
    if a <= cutoffs[0.25]:
        return 4
    elif a <= cutoffs[0.50]:
        return 3
    elif a <= cutoffs[0.75]:
        return 2
    else:
        return 1
# Frequency / monetary: a larger value is better, so it earns a larger score.
def FM_Score(a, b, c):
    """Map a value onto a 1-4 score where larger values score higher.

    Args:
        a: The raw value to score.
        b: Key of the metric inside ``c`` (callers pass ``"F_Value"`` or
            ``"M_Value"``).
        c: Nested mapping ``{metric: {0.25: q1, 0.5: q2, 0.75: q3}}`` holding
            the quartile cut-offs.

    Returns:
        1 if ``a`` is at or below the 25 % cut-off, 2 if at or below the
        50 % cut-off, 3 if at or below the 75 % cut-off, otherwise 4.
    """
    cutoffs = c[b]
    if a <= cutoffs[0.25]:
        return 1
    elif a <= cutoffs[0.50]:
        return 2
    elif a <= cutoffs[0.75]:
        return 3
    else:
        return 4
# Add the R_Score, F_Score and M_Score columns by mapping each raw
# R_Value / F_Value / M_Value onto the 1-4 scale. Recency uses R_Score
# (smaller is better); frequency and monetary both use FM_Score (larger is
# better). The value column's name doubles as the key into `quantiles`.
for _score_column, _value_column, _scorer in (
    ('R_Score', 'R_Value', R_Score),
    ('F_Score', 'F_Value', FM_Score),
    ('M_Score', 'M_Value', FM_Score),
):
    customer_seg[_score_column] = customer_seg[_value_column].apply(
        _scorer, args=(_value_column, quantiles)
    )

# Notebook-style preview of the first rows with the new score columns.
customer_seg.head()
# Assign each customer to one of the eight RFM segments.
def RFM_User(df):
    """Return the RFM segment label for one customer row.

    Each of the three scores is treated as "high" when it is greater than 2
    and "low" when it is less than or equal to 2. A high monetary score gives
    the "重要" (important) prefix, a low one the "一般" (general) prefix; the
    frequency/recency combination selects the rest of the label:

    * F high, R high -> 價值用戶 (high-value customer)
    * F low,  R high -> 發展用戶 (customer to develop)
    * F high, R low  -> 保持用戶 (customer to keep)
    * F low,  R low  -> 挽留用戶 (customer to win back)

    Args:
        df: A mapping-like row providing ``'M_Score'``, ``'F_Score'`` and
            ``'R_Score'`` (for example one row from
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        The segment label, or ``None`` when no rule matches (for example when
        a score is NaN, for which both comparisons are false).
    """
    m, f, r = df['M_Score'], df['F_Score'], df['R_Score']
    if m > 2 and f > 2 and r > 2:
        return '重要價值用戶'
    if m > 2 and f <= 2 and r > 2:
        return '重要發展用戶'
    if m > 2 and f > 2 and r <= 2:
        return '重要保持用戶'
    if m > 2 and f <= 2 and r <= 2:
        return '重要挽留用戶'
    if m <= 2 and f > 2 and r > 2:
        return '一般價值用戶'
    if m <= 2 and f <= 2 and r > 2:
        return '一般發展用戶'
    if m <= 2 and f > 2 and r <= 2:
        return '一般保持用戶'
    if m <= 2 and f <= 2 and r <= 2:
        return '一般挽留用戶'
    # No rule matched; keep the implicit "no label" result explicit.
    return None
# Label every customer: RFM_User is called once per row and its return value
# is stored in the new 'Customer_Segmentation' column.
customer_seg['Customer_Segmentation'] = customer_seg.apply(
    RFM_User, axis='columns'
)
# Notebook-style bare expression (displays the labelled table in a notebook).
customer_seg
文章來源:http://www.zghlxwxcb.cn/news/detail-790409.html
8.數據保存存儲
(1).to_csv
# Save the per-customer RFM table as CSV. The index is written as well:
# customer_seg comes from a groupby on 'Order Customer Id', so the customer
# id lives in the index and dropping it would leave rows that cannot be tied
# back to a customer.
customer_seg.to_csv('supply_chain_rfm_result.csv', index=True)
(2).to_pickle
# Persist the preprocessed order-level frame (including the columns added
# above) as a pickle file so it can be reloaded without redoing the steps.
data.to_pickle(path='data.pkl')
參考資料:開課吧。文章來源地址:http://www.zghlxwxcb.cn/news/detail-790409.html
到了這里,關于Python綜合數據分析_RFM用戶分層模型的文章就介紹完了。如果您還想了解更多內容,請在右上角搜索TOY模板網以前的文章或繼續瀏覽下面的相關文章,希望大家以后多多支持TOY模板網!