developer.chat
22 September 2024
category
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import gc
from sklearn.model_selection import cross_val_score,GridSearchCV,train_test_split
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.metrics import roc_curve,roc_auc_score,classification_report,mean_squared_error,accuracy_score
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,BaggingClassifier,VotingClassifier,AdaBoostClassifier
In this task, you will be asked to predict the chances of a user listening to a song repetitively after the first observable listening event within a time window was triggered. If there are recurring listening event(s) triggered within a month after the user’s very first observable listening event, its target is marked 1, and 0 otherwise in the training set. The same rule applies to the testing set. KKBOX provides a training data set consists of information of the first observable listening event for each unique user-song pair within a specific time duration. Metadata of each unique user and song pair is also provided. The use of public data to increase the level of accuracy of your prediction is encouraged. The train and the test data are selected from users listening history in a given time period. Note that this time period is chosen to be before the WSDM-KKBox Churn Prediction time period. The train and test sets are split based on time, and the split of public/private are based on unique user/song pairs. Tables train.csv msno: user id song_id: song id source_system_tab: the name of the tab where the event was triggered. System tabs are used to categorize KKBOX mobile apps functions. For example, tab my library contains functions to manipulate the local storage, and tab search contains functions relating to search. source_screen_name: name of the layout a user sees. source_type: an entry point a user first plays music on mobile apps. An entry point could be album, online-playlist, song .. etc. target: this is the target variable. target=1 means there are recurring listening event(s) triggered within a month after the user’s very first observable listening event, target=0 otherwise . test.csv id: row id (will be used for submission) msno: user id song_id: song id source_system_tab: the name of the tab where the event was triggered. System tabs are used to categorize KKBOX mobile apps functions. For example, tab my library contains functions to manipulate the local storage, and tab search contains functions relating to search. source_screen_name: name of the layout a user sees. source_type: an entry point a user first plays music on mobile apps. An entry point could be album, online-playlist, song .. etc. sample_submission.csv sample submission file in the format that we expect you to submit id: same as id in test.csv target: this is the target variable. target=1 means there are recurring listening event(s) triggered within a month after the user’s very first observable listening event, target=0 otherwise . songs.csv The songs. Note that data is in unicode. song_id song_length: in ms genre_ids: genre category. Some songs have multiple genres and they are separated by | artist_name composer lyricist language members.csv user information. msno city bd: age. Note: this column has outlier values, please use your judgement. gender registered_via: registration method registration_init_time: format %Y%m%d expiration_date: format %Y%m%d song_extra_info.csv song_id song name - the name of the song. isrc - International Standard Recording Code, theoretically can be used as an identity of a song. However, what worth to note is, ISRCs generated from providers have not been officially verified; therefore the information in ISRC, such as country code and reference year, can be misleading/incorrect. Multiple songs could share one ISRC since a single recording could be re-published several times.
In [2]:
import lightgbm as lgb
from sklearn.metrics import precision_recall_curve,roc_auc_score,classification_report,roc_curve
from tqdm import tqdm
In [3]:
from subprocess import check_output
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
songs = pd.read_csv('../input/songs.csv')
members = pd.read_csv('../input/members.csv')
sample = pd.read_csv('../input/sample_submission.csv')
In [4]:
train.head()
Out[4]:
In [5]:
test.head()
Out[5]:
In [6]:
songs.head()
Out[6]:
In [7]:
members.head()
Out[7]:
In [8]:
sample.head()
members.shape
train.info()
print("\n")
songs.info()
print("\n")
members.info()
In [9]:
plt.figure(figsize=(20,15))
sns.set(font_scale=2)
sns.countplot(x='source_type',hue='source_type',data=train)
sns.set(style="darkgrid")
plt.xlabel('source types',fontsize=30)
plt.ylabel('count',fontsize=30)
plt.xticks(rotation='45')
plt.title('Count plot source types for listening music',fontsize=30)
plt.tight_layout()
In [10]:
plt.figure(figsize=(20,15))
sns.set(font_scale=2)
sns.countplot(y='source_screen_name',data=train,facecolor=(0,0,0,0),linewidth=5,edgecolor=sns.color_palette('dark',3))
sns.set(style="darkgrid")
plt.xlabel('source types',fontsize=30)
plt.ylabel('count',fontsize=30)
plt.xticks(rotation='45')
plt.title('Count plot for which screen using ',fontsize=30)
plt.tight_layout()
In [11]:
plt.figure(figsize=(20,15))
sns.set(font_scale=2)
sns.countplot(x='source_system_tab',hue='source_system_tab',data=train)
sns.set(style="darkgrid")
plt.xlabel('source types',fontsize=30)
plt.ylabel('count',fontsize=30)
plt.xticks(rotation='45')
plt.title('Count plot for system tab there are using',fontsize=30)
plt.tight_layout()
In [12]:
import matplotlib as mpl
mpl.rcParams['font.size'] = 40.0
labels = ['Male','Female']
plt.figure(figsize = (12, 12))
sizes = pd.value_counts(members.gender)
patches, texts, autotexts = plt.pie(sizes,
labels=labels, autopct='%.0f%%',
shadow=False, radius=1,startangle=90)
for t in texts:
t.set_size('smaller')
plt.legend()
plt.show()
In [13]:
import matplotlib.pyplot as plt
mpl.rcParams['font.size'] = 40.0
plt.figure(figsize = (20, 20))
# Make data: I have 3 groups and 7 subgroups
group_names=['explore','my library','search','discover','radio','listen with','notification','settings']
group_size=pd.value_counts(train.source_system_tab)
print(group_size)
subgroup_names=['Male','Female']
subgroup_size=pd.value_counts(members.gender)
# Create colors
a, b, c,d,e,f,g,h=[plt.cm.autumn, plt.cm.GnBu, plt.cm.YlGn,plt.cm.Purples,plt.cm.cool,plt.cm.RdPu,plt.cm.BuPu,plt.cm.bone]
# First Ring (outside)
fig, ax = plt.subplots()
ax.axis('equal')
mypie, texts= ax.pie(group_size, radius=3.0,labels=group_names, colors=[a(0.6), b(0.6), c(0.6),d(0.6), e(0.6), f(0.6),g(0.6)])
plt.setp( mypie, width=0.3, edgecolor='white')
# Second Ring (Inside)
#mypie2, texts1 = ax.pie(subgroup_size, radius=3.0-0.3, labels=subgroup_names, labeldistance=0.7, colors=[h(0.5), b(0.4)])
#plt.setp( mypie2, width=0.3, edgecolor='white')
#plt.margins(0,0)
#for t in texts:
# t.set_size(25.0)
#for t in texts1:
#t.set_size(25.0)
plt.legend()
# show it
plt.show()
In [14]:
print(members.describe())
In [15]:
print(songs.describe())
In [16]:
mpl.rcParams['font.size'] = 40.0
plt.figure(figsize = (20, 20))
sns.distplot(members.registration_init_time)
sns.set(font_scale=2)
plt.ylabel('ecdf',fontsize=50)
plt.xlabel('registration time ' ,fontsize=50)
Out[16]:
In [17]:
members.describe()
Out[17]:
In [18]:
songs.describe()
Out[18]:
In [19]:
train.describe()
Out[19]:
In [20]:
train.info()
In [21]:
members.info()
In [22]:
train_members = pd.merge(train, members, on='msno', how='inner')
train_merged = pd.merge(train_members, songs, on='song_id', how='outer')
print(train_merged.head())
In [23]:
test_members = pd.merge(test, members, on='msno', how='inner')
test_merged = pd.merge(test_members, songs, on='song_id', how='outer')
print(test_merged.head())
print(len(test_merged.columns))
In [24]:
del train_members
del test_members
In [25]:
ax = sns.countplot(y=train_merged.dtypes, data=train_merged)
In [26]:
print(train_merged.columns.to_series().groupby(train_merged.dtypes).groups)
print(test_merged.columns.to_series().groupby(test_merged.dtypes).groups)
In [27]:
msno.heatmap(train_merged)
#msno.matrix(train_merged)
Out[27]:
In [28]:
#msno.dendrogram(train_merged)
In [29]:
#--- Function to check if missing values are present and if so print the columns having them ---
def check_missing_values(df):
print (df.isnull().values.any())
if (df.isnull().values.any() == True):
columns_with_Nan = df.columns[df.isnull().any()].tolist()
print(columns_with_Nan)
for col in columns_with_Nan:
print("%s : %d" % (col, df[col].isnull().sum()))
check_missing_values(train_merged)
check_missing_values(test_merged)
In [30]:
#--- Function to replace Nan values in columns of type float with -5 ---
def replace_Nan_non_object(df):
object_cols = list(df.select_dtypes(include=['float']).columns)
for col in object_cols:
df[col]=df[col].fillna(np.int(-5))
replace_Nan_non_object(train_merged)
replace_Nan_non_object(test_merged)
In [31]:
#--- memory consumed by train dataframe ---
mem = train_merged.memory_usage(index=True).sum()
print("Memory consumed by training set : {} MB" .format(mem/ 1024**2))
#--- memory consumed by test dataframe ---
mem = test_merged.memory_usage(index=True).sum()
print("Memory consumed by test set : {} MB" .format(mem/ 1024**2))
In [32]:
def change_datatype(df):
float_cols = list(df.select_dtypes(include=['float']).columns)
for col in float_cols:
if ((np.max(df[col]) <= 127) and(np.min(df[col] >= -128))):
df[col] = df[col].astype(np.int8)
elif ((np.max(df[col]) <= 32767) and(np.min(df[col] >= -32768))):
df[col] = df[col].astype(np.int16)
elif ((np.max(df[col]) <= 2147483647) and(np.min(df[col] >= -2147483648))):
df[col] = df[col].astype(np.int32)
else:
df[col] = df[col].astype(np.int64)
change_datatype(train_merged)
change_datatype(test_merged)
In [33]:
data = train_merged.groupby('target').aggregate({'msno':'count'}).reset_index()
a4_dims = (15, 8)
fig, ax = plt.subplots(figsize=a4_dims)
ax = sns.barplot(x='target', y='msno', data=data)
In [34]:
mpl.rcParams['font.size'] = 40.0
plt.figure(figsize = (20, 20))
data=train_merged.groupby('source_system_tab').aggregate({'msno':'count'}).reset_index()
sns.barplot(x='source_system_tab',y='msno',data=data)
Out[34]:
In [35]:
data = train_merged.groupby('source_screen_name').aggregate({'msno':'count'}).reset_index()
a4_dims = (15, 7)
fig, ax = plt.subplots(figsize=a4_dims)
ax = sns.barplot(x='source_screen_name', y='msno', data=data)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
Out[35]:
In [36]:
data = train_merged.groupby('source_type').aggregate({'msno':'count'}).reset_index()
a4_dims = (15, 7)
fig, ax = plt.subplots(figsize=a4_dims)
ax = sns.barplot(x='source_type', y='msno', data=data)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
Out[36]:
In [37]:
data = train_merged.groupby('language').aggregate({'msno':'count'}).reset_index()
a4_dims = (15, 7)
fig, ax = plt.subplots(figsize=a4_dims)
ax = sns.barplot(x='language', y='msno', data=data)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
Out[37]:
In [38]:
data = train_merged.groupby('registered_via').aggregate({'msno':'count'}).reset_index()
a4_dims = (15, 7)
fig, ax = plt.subplots(figsize=a4_dims)
ax = sns.barplot(x='registered_via', y='msno', data=data)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
Out[38]:
In [39]:
print(train_merged.columns)
data = train_merged.groupby('city').aggregate({'msno':'count'}).reset_index()
a4_dims = (15, 7)
fig, ax = plt.subplots(figsize=a4_dims)
ax = sns.barplot(x='city', y='msno', data=data)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
Out[39]:
In [40]:
a4_dims = (15, 7)
fig, ax = plt.subplots(figsize=a4_dims)
ax=sns