DEBUG = False
!pip install polars
!pip install snoop
from collections import defaultdict, Counter
import gc
from snoop import pp
import polars as pl
import pandas as pd
import numpy as np
import random
from polars.testing import assert_frame_equal, assert_series_equal
from datetime import datetime
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_colwidth', None)
cfg = pl.Config.restore_defaults()
pl.Config.set_tbl_rows(50)
pl.Config.set_fmt_str_lengths(1000)
if DEBUG: fraction_of_sessions_to_use = 0.00001
else: fraction_of_sessions_to_use = 1
train_ms = pl.scan_parquet('/kaggle/input/otto-radek-style-polars/train_ms.parquet')
test_ms = pl.scan_parquet('/kaggle/input/otto-radek-style-polars/test_ms.parquet')
sample_sub = pl.scan_csv('/kaggle/input/otto-recommender-system/sample_submission.csv')
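Before subsetting, it is worth peeking at a few rows. This assumes the parquet files follow Radek's layout, with session, aid, ts (in milliseconds), and type (0/1/2 for clicks/carts/orders) columns:
# peek at a few rows without collecting the full LazyFrames
train_ms.fetch(3)
test_ms.fetch(3)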
Subset the train and test sets and concatenate them together
%%time
lucky_sessions_train = (
train_ms
.select([
pl.col('session').unique().sample(frac=fraction_of_sessions_to_use, seed=42)
])
.collect()
.to_series().to_list()
)
lucky_sessions_test = (
test_ms
.select([
pl.col('session').unique().sample(frac=fraction_of_sessions_to_use, seed=42)
])
.collect()
.to_series().to_list()
)
subset_of_train = (
train_ms
.filter(pl.col('session').is_in(lucky_sessions_train))
)
subset_of_test = (
test_ms
.filter(pl.col('session').is_in(lucky_sessions_test))
)
subsets = pl.concat([subset_of_train, subset_of_test]).collect()
sessions = subsets.select('session').unique().sort('session').to_series().to_list() # sort so the chunked is_between ranges below are valid
pp(lucky_sessions_train[:3], len(lucky_sessions_train), lucky_sessions_test[:3], len(lucky_sessions_test),
subset_of_train.collect().height, subset_of_test.collect().height, subsets.height)
Create the co-visitation matrix
The co-visitation matrix is just a name for exploring the following ideas/questions:
Is there any relationship between one aid/product and the other aids/products in the same session, or across all sessions?
Are some aids more similar to certain aids and less similar to others?
When a user views this aid, are some aids more likely than others to be clicked/carted/ordered next?
Can we pair up the aids within each session and count how many times each pair occurs?
Since one aid (e.g. '122') can have many pair partners, can we find the most common partners of aid '122' by counting the occurrences of each pair ('122', partner)?
Could the next click, cart, or order be the most common pair partner of a test session's last aid (or of all its aids)?
The first challenge in building the co-visitation matrix is how to do the pairing.
In Radek's notebook, the pairing logic is as follows:
use only the last 30 aids of each session to pair with one another
remove pairs whose two partners are the same aid
keep only pairs where the right partner occurs within one day after the left partner
We can tweak this pairing logic to vary our co-visitation matrix; a toy sketch of these rules follows.
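To make these rules concrete, here is a minimal sketch of the join-and-filter pairing on a hand-made toy frame (the column names mirror the real data; the values are made up, and the tail(30) rule is skipped here):
# toy demo of the pairing rules on made-up data (ts is in milliseconds, as in the real data)
toy = pl.DataFrame({
'session': [1, 1, 1],
'aid': [100, 200, 300],
'ts': [0, 1000, 25*60*60*1000], # the third event happens 25 hours after the first
})
(
toy.join(toy, on='session', suffix='_right')
.filter(pl.col('aid') != pl.col('aid_right')) # rule 2: no self-pairs
.with_columns([((pl.col('ts_right') - pl.col('ts'))/(24*60*60*1000)).alias('days_elapsed')])
.filter((pl.col('days_elapsed')>=0) & (pl.col('days_elapsed')<=1)) # rule 3: right partner within one day after left
) # only the pair (100, 200) survives; aid 300 occurs more than 24 hours after both others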
%%time
next_AIDs = defaultdict(Counter)
chunk_size = 300000
for i in range(0, len(sessions), chunk_size):
current_chunk = (
subsets
.filter(pl.col('session').is_between(sessions[i], sessions[np.min([i+chunk_size-1, len(sessions)-1])], closed='both'))
.unique() # no duplicates
.groupby('session').tail(30)
)
current_chunk = (
current_chunk
.join(current_chunk, on='session', suffix='_right')
.sort(['session', 'aid', 'aid_right']) # nice view
.filter(pl.col('aid') != pl.col('aid_right')) # drop pairs of an aid with itself
.with_columns([
((pl.col('ts_right') - pl.col('ts'))/(24*60*60*1000)).alias('days_elapsed') # time gap in days; positive means aid_right occurs after aid
])
.filter((pl.col('days_elapsed')>=0) & (pl.col('days_elapsed') <=1)) # keep only pairs where aid_right occurs within 24 hours after aid
)
# defaultdict + Counter is much faster than a pure polars solution here
for aid_x, aid_y in zip(current_chunk.select('aid').to_series().to_list(), current_chunk.select('aid_right').to_series().to_list()):
next_AIDs[aid_x][aid_y] += 1
print(f'{i//chunk_size + 1} out of {int(np.ceil(len(sessions)/chunk_size))} chunks - {np.min([i+chunk_size, len(sessions)])} sessions are done')
len(next_AIDs)
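As a quick sanity check (not part of Radek's pipeline), we can peek at one entry of the co-visitation matrix: an arbitrary aid and its most frequent pair partners with their counts.
# inspect one aid's most common partners; `some_aid` is just whichever key comes first
some_aid = next(iter(next_AIDs))
pp(some_aid, next_AIDs[some_aid].most_common(5))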
The polars version runs about as fast as the pandas version; see Radek's notebook for his timings.
del train_ms, subset_of_train, subsets
gc.collect()
Use the co-visitation matrix to provide candidate aids when a test session has fewer than 20 aids
Radek shows us two things here:
how to create features (weights based on time, type, and occurrence count) to pick 20 aids for a test session
how to select candidates from the co-visitation matrix:
for each aid in the test session, take its 20 most common partner aids and put them into a candidate list
take the 40 most common aids from the candidate list and, if they are new to the session, add them to the test session's aids
then take the first 20 aids as the prediction
%%time
lists_aids_types = (
test_ms
.unique() # drop duplicate events
.groupby('session')
.agg([
pl.col('aid').list().alias('test_session_AIDs'),
pl.col('type').list().alias('test_session_types'),
])
.collect()
)
lists_aids_types.head()
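Before walking through the loop below, it may help to see what the time weights actually look like; a minimal sketch for a hypothetical 5-event session:
# what np.logspace gives for a session of 5 events: later events get weights closer to 1
demo_weights = np.logspace(start=0.1, stop=1, num=5, base=2, endpoint=True) - 1
demo_weights.round(3) # roughly [0.072, 0.253, 0.464, 0.711, 1.0]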
%%time
labels = []
session_types = ['clicks', 'carts', 'orders']
no_data = 0
no_data_all_aids = 0
type_weight_multipliers = {0: 1, 1: 6, 2: 3}
test_session_AIDs = lists_aids_types.select('test_session_AIDs').to_series().to_list()
test_session_types = lists_aids_types.select('test_session_types').to_series().to_list()
# take each session's aids and types
for AIDs, types in zip(test_session_AIDs, test_session_types):
# if the session has at least 20 aids
if len(AIDs) >= 20:
# np.logspace: Return numbers spaced evenly on a log scale.
# `-1` shifts the weights so they range within (0, 1]
# the weights are assigned to AIDs in chronological order: later events get larger weights
weights=np.logspace(start=0.1,stop=1,num=len(AIDs),base=2, endpoint=True)-1
# create a defaultdict for this session only
# anything added into this dict will have a default value 0
# try `aids_temp[1]` and `aids_temp`
aids_temp=defaultdict(lambda: 0)
# in each sess, an aid may occur multiples in multiple types at different time,
# the line below is to take all 3 factors into account to value the importance of this aid to the session
# each unique aid and its aggregated weight are stored in a defaultdict
for aid,w,t in zip(AIDs,weights,types):
aids_temp[aid]+= w * type_weight_multipliers[t]
# sort the aids of this session by their aggregated weights, descending
sorted_aids=[k for k, v in sorted(aids_temp.items(), key=lambda item: -item[1])]
# replacing the line above with the polars version below is actually about 2x slower
# aid = [key for (key, value) in aids_temp.items()]
# adwt = [value for (key, value) in aids_temp.items()]
# sorted_aids = (
# pl.DataFrame([aid, adwt], columns=['aid', 'weight'])
# .sort('weight', reverse=True)
# .select('aid').to_series().to_list()
# )
# take the 20 aids with the largest weights from this session as one list and append it into a new list `labels`
labels.append(sorted_aids[:20])
# when this session has fewer than 20 aids
else:
# reverse the order of AIDs (a list of aids of this session) and remove the duplicated aids
AIDs = list(dict.fromkeys(AIDs[::-1])) # python version
# If using the polars below to replace the line above, it is drastically slower (and `unique` does not preserve order)
# AIDs = pl.Series('aid', AIDs).unique().reverse().to_list() # polars version
# keep track of the length of new AIDs above
AIDs_len_start = len(AIDs)
candidates = []
# take each unique aid of this session and access its 20 most common pair partners and their counts
# insert the list of the 20 most common pair-partner aids into another list `candidates` (kept as one flat list)
# in the end, this `candidates` list is long and contains many duplicates
for AID in AIDs:
if AID in next_AIDs: candidates += [aid for aid, count in next_AIDs[AID].most_common(20)]
# take the 40 most common aids from `candidates`, and if they are not already inside this session's AIDs,
# then append them to AIDs (still one flat list because of `+` on lists; `append` would nest them)
AIDs += [AID for AID, cnt in Counter(candidates).most_common(40) if AID not in AIDs]
# but we still only take the first 20 aids from AIDs as this session's prediction and store it in `labels`
labels.append(AIDs[:20])
# if no candidates are generated, add 1 to `no_data`
# if candidates == []: no_data += 1 # this variable is actually not used by Radek
# count the sessions whose AIDs were not extended by any candidate
if AIDs_len_start == len(AIDs): no_data_all_aids += 1
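The else branch is the subtle part; here is a toy walk-through of the dedup-and-extend trick with made-up aids and candidates (not real data):
# toy walk-through of the short-session branch (made-up aids)
toy_AIDs = [11, 22, 11, 33]
toy_AIDs = list(dict.fromkeys(toy_AIDs[::-1])) # -> [33, 11, 22]: most recent first, deduplicated
toy_candidates = [44, 55, 44] # pretend these came from next_AIDs[...].most_common(20)
toy_AIDs += [a for a, cnt in Counter(toy_candidates).most_common(40) if a not in toy_AIDs]
toy_AIDs # -> [33, 11, 22, 44, 55]: the session's own aids first, then new candidates by frequency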
sample_sub.fetch().head()
Create the submission labels
%%time
(
pl.DataFrame({'session': lists_aids_types.select('session').to_series().to_list(),
'labels': labels})
.with_columns([
pl.col('labels').arr.eval(pl.element().cast(pl.Utf8)).arr.join(' '),
(pl.col('session')+"_clicks").alias('clicks'),
(pl.col('session')+"_carts").alias('carts'),
(pl.col('session')+"_orders").alias('orders'),
])
.select([
'session',
pl.concat_list(['clicks', 'carts', 'orders']).alias('session_type'),
'labels'
])
.explode('session_type')
.sort('session')
.select(pl.exclude('session'))
.write_csv('submission.csv')
)
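The wide-to-long reshape above (three suffixed columns combined with concat_list, then exploded) can be hard to picture; a minimal sketch on one made-up session:
# minimal sketch of the concat_list + explode reshape (one made-up session, session kept as a string)
(
pl.DataFrame({'session': ['12899779'], 'labels': ['59625 1253524']})
.with_columns([
(pl.col('session') + "_clicks").alias('clicks'),
(pl.col('session') + "_carts").alias('carts'),
(pl.col('session') + "_orders").alias('orders'),
])
.select([pl.concat_list(['clicks', 'carts', 'orders']).alias('session_type'), 'labels'])
.explode('session_type') # -> 3 rows: 12899779_clicks / _carts / _orders, each with the same labels
)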
print(f'Test sessions that we did not manage to extend based on the co-visitation matrix: {no_data_all_aids}')
pl.read_csv('submission.csv').shape
sample_sub.collect().shape
# from matplotlib import pyplot as plt
# plt.hist([len(l) for l in labels]);
# plt.suptitle('Distribution of predicted sequence lengths');