Kaggle : Airbnb New User Booking

Abstact :

  • purpose of analysis : Predict destination country

  • evaluation metric : NDCG (Normalized discounted cumulative gain)

\[DCG_k=\sum_{i=1}^k\frac{2^{rel_i}-1}{\log_2{\left(i+1\right)}}\] \[nDCG_k=\frac{DCG_k}{IDCG_k}\]
  • model : Light GBM

  • score : 0.88461 (top 11%)

import re
import gc
from datetime import datetime

import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from IPython.core.display import Image 
from IPython.display import display

from scipy.stats import mode
import scipy.stats as stats

from sklearn.tree import export_graphviz
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel


import lightgbm as lgb
from lightgbm import plot_importance

import warnings

warnings.filterwarnings('ignore')
df_train = pd.read_csv("train_users_2.csv")
df_test = pd.read_csv("test_users.csv")
df_countries = pd.read_csv('countries.csv')
df_sessions = pd.read_csv('sessions.csv')
df_age_bkts = pd.read_csv('age_gender_bkts.csv')
df_all = pd.concat([df_train, df_test], axis=0)
train_id = df_train.id
test_id = df_test.id
all_id = df_all.id
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Data columns (total 16 columns):
id                         213451 non-null object
date_account_created       213451 non-null object
timestamp_first_active     213451 non-null int64
date_first_booking         88908 non-null object
gender                     213451 non-null object
age                        125461 non-null float64
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser              213451 non-null object
country_destination        213451 non-null object
dtypes: float64(1), int64(2), object(13)
memory usage: 26.1+ MB
df_train.head()
iddate_account_createdtimestamp_first_activedate_first_bookinggenderagesignup_methodsignup_flowlanguageaffiliate_channelaffiliate_providerfirst_affiliate_trackedsignup_appfirst_device_typefirst_browsercountry_destination
0gxn3p5htnn2010-06-2820090319043255NaN-unknown-NaNfacebook0endirectdirectuntrackedWebMac DesktopChromeNDF
1820tgsjxq72011-05-2520090523174809NaNMALE38.0facebook0enseogoogleuntrackedWebMac DesktopChromeNDF
24ft3gnwmtx2010-09-28200906092312472010-08-02FEMALE56.0basic3endirectdirectuntrackedWebWindows DesktopIEUS
3bjjt8pjhuk2011-12-05200910310601292012-09-08FEMALE42.0facebook0endirectdirectuntrackedWebMac DesktopFirefoxother
487mebub9p42010-09-14200912080611052010-02-18-unknown-41.0basic0endirectdirectuntrackedWebMac DesktopChromeUS
df_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62096 entries, 0 to 62095
Data columns (total 15 columns):
id                         62096 non-null object
date_account_created       62096 non-null object
timestamp_first_active     62096 non-null int64
date_first_booking         0 non-null float64
gender                     62096 non-null object
age                        33220 non-null float64
signup_method              62096 non-null object
signup_flow                62096 non-null int64
language                   62096 non-null object
affiliate_channel          62096 non-null object
affiliate_provider         62096 non-null object
first_affiliate_tracked    62076 non-null object
signup_app                 62096 non-null object
first_device_type          62096 non-null object
first_browser              62096 non-null object
dtypes: float64(2), int64(2), object(11)
memory usage: 7.1+ MB
df_test.head()
iddate_account_createdtimestamp_first_activedate_first_bookinggenderagesignup_methodsignup_flowlanguageaffiliate_channelaffiliate_providerfirst_affiliate_trackedsignup_appfirst_device_typefirst_browser
05uwns89zht2014-07-0120140701000006NaNFEMALE35.0facebook0endirectdirectuntrackedMowebiPhoneMobile Safari
1jtl0dijy2j2014-07-0120140701000051NaN-unknown-NaNbasic0endirectdirectuntrackedMowebiPhoneMobile Safari
2xx0ulgorjt2014-07-0120140701000148NaN-unknown-NaNbasic0endirectdirectlinkedWebWindows DesktopChrome
36c6puo6ix02014-07-0120140701000215NaN-unknown-NaNbasic0endirectdirectlinkedWebWindows DesktopIE
4czqhjk3yfe2014-07-0120140701000305NaN-unknown-NaNbasic0endirectdirectuntrackedWebMac DesktopSafari
df_sessions.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10567737 entries, 0 to 10567736
Data columns (total 6 columns):
user_id          object
action           object
action_type      object
action_detail    object
device_type      object
secs_elapsed     float64
dtypes: float64(1), object(5)
memory usage: 483.8+ MB
df_sessions.head()
user_idactionaction_typeaction_detaildevice_typesecs_elapsed
0d1mm9tcy42lookupNaNNaNWindows Desktop319.0
1d1mm9tcy42search_resultsclickview_search_resultsWindows Desktop67753.0
2d1mm9tcy42lookupNaNNaNWindows Desktop301.0
3d1mm9tcy42search_resultsclickview_search_resultsWindows Desktop22141.0
4d1mm9tcy42lookupNaNNaNWindows Desktop435.0
df_countries.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
country_destination              10 non-null object
lat_destination                  10 non-null float64
lng_destination                  10 non-null float64
distance_km                      10 non-null float64
destination_km2                  10 non-null float64
destination_language             10 non-null object
language_levenshtein_distance    10 non-null float64
dtypes: float64(5), object(2)
memory usage: 640.0+ bytes
df_countries.head()
country_destinationlat_destinationlng_destinationdistance_kmdestination_km2destination_languagelanguage_levenshtein_distance
0AU-26.853388133.27516015297.74407741220.0eng0.00
1CA62.393303-96.8181462828.13339984670.0eng0.00
2DE51.16570710.4527647879.5680357022.0deu72.61
3ES39.896027-2.4876947730.7240505370.0spa92.25
4FR46.2321932.2096677682.9450643801.0fra92.06
df_age_bkts.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 5 columns):
age_bucket                 420 non-null object
country_destination        420 non-null object
gender                     420 non-null object
population_in_thousands    420 non-null float64
year                       420 non-null float64
dtypes: float64(2), object(3)
memory usage: 16.5+ KB
df_age_bkts.head()
age_bucketcountry_destinationgenderpopulation_in_thousandsyear
0100+AUmale1.02015.0
195-99AUmale9.02015.0
290-94AUmale47.02015.0
385-89AUmale118.02015.0
480-84AUmale199.02015.0

country destination

df_train.country_destination.value_counts()
NDF      124543
US        62376
other     10094
FR         5023
IT         2835
GB         2324
ES         2249
CA         1428
DE         1061
NL          762
AU          539
PT          217
Name: country_destination, dtype: int64
ax = sns.countplot(x=df_train.country_destination , data=df_train)
plt.show()
round(df_train["country_destination"].value_counts() / len(df_train) * 100, 2)
NDF      58.35
US       29.22
other     4.73
FR        2.35
IT        1.33
GB        1.09
ES        1.05
CA        0.67
DE        0.50
NL        0.36
AU        0.25
PT        0.10
Name: country_destination, dtype: float64

feature engineering

date time convert

#date account create
df_all["date_account_created"] = pd.to_datetime(df_all["date_account_created"], format = "%Y-%m-%d")

#timestamp first active
df_all["timestamp_first_active"] = pd.to_datetime(df_all["timestamp_first_active"], format="%Y%m%d%H%M%S")
df_all['create_year'] = df_all["date_account_created"].apply(lambda x : x.year)
df_all['create_month'] = df_all["date_account_created"].apply(lambda x : x.month)
df_all['create_day'] = df_all["date_account_created"].apply(lambda x : x.day)

df_all['active_year'] = df_all["timestamp_first_active"].apply(lambda x : x.year)
df_all['active_month'] = df_all["timestamp_first_active"].apply(lambda x : x.month)
df_all['active_day'] = df_all["timestamp_first_active"].apply(lambda x : x.day)
df_all.shape
(275547, 22)
df_all.head()
affiliate_channelaffiliate_provideragecountry_destinationdate_account_createddate_first_bookingfirst_affiliate_trackedfirst_browserfirst_device_typegender...signup_appsignup_flowsignup_methodtimestamp_first_activecreate_yearcreate_monthcreate_dayactive_yearactive_monthactive_day
0directdirectNaNNDF2010-06-28NaNuntrackedChromeMac Desktop-unknown-...Web0facebook2009-03-19 04:32:5520106282009319
1seogoogle38.0NDF2011-05-25NaNuntrackedChromeMac DesktopMALE...Web0facebook2009-05-23 17:48:0920115252009523
2directdirect56.0US2010-09-282010-08-02untrackedIEWindows DesktopFEMALE...Web3basic2009-06-09 23:12:472010928200969
3directdirect42.0other2011-12-052012-09-08untrackedFirefoxMac DesktopFEMALE...Web0facebook2009-10-31 06:01:29201112520091031
4directdirect41.0US2010-09-142010-02-18untrackedChromeMac Desktop-unknown-...Web0basic2009-12-08 06:11:0520109142009128

5 rows × 22 columns

lagging time

lagging = df_all["timestamp_first_active"] - df_all["date_account_created"]
#lagging time days, log seconds
df_all["lag_days"] = lagging.apply(lambda x : -1 * x.days)
df_all["lag_seconds"] = np.log(lagging.apply(lambda x : x.seconds))
df_all.head()
affiliate_channelaffiliate_provideragecountry_destinationdate_account_createddate_first_bookingfirst_affiliate_trackedfirst_browserfirst_device_typegender...signup_methodtimestamp_first_activecreate_yearcreate_monthcreate_dayactive_yearactive_monthactive_daylag_dayslag_seconds
0directdirectNaNNDF2010-06-28NaNuntrackedChromeMac Desktop-unknown-...facebook2009-03-19 04:32:55201062820093194669.703511
1seogoogle38.0NDF2011-05-25NaNuntrackedChromeMac DesktopMALE...facebook2009-05-23 17:48:092011525200952373211.068028
2directdirect56.0US2010-09-282010-08-02untrackedIEWindows DesktopFEMALE...basic2009-06-09 23:12:47201092820096947611.333404
3directdirect42.0other2011-12-052012-09-08untrackedFirefoxMac DesktopFEMALE...facebook2009-10-31 06:01:292011125200910317659.984560
4directdirect41.0US2010-09-142010-02-18untrackedChromeMac Desktop-unknown-...basic2009-12-08 06:11:052010914200912828010.010771

5 rows × 24 columns

holiday

def get_holidays(year):
    response = requests.get("https://www.timeanddate.com/calendar/custom.html?year="+str(year)+"                                &country=1&cols=3&df=1&hol=25")
    dom = BeautifulSoup(response.content, "html.parser")

    trs = dom.select("table.cht.lpad tr")

    df = pd.DataFrame(columns=["date", "holiday"])
    for tr in trs:
        datestr = tr.select_one("td:nth-of-type(1)").text
        date = datetime.strptime("{} {}".format(year, datestr), '%Y %b %d')
        holiday = tr.select_one("td:nth-of-type(2)").text
        df.loc[len(df)] = {"date" : date, "holiday": 1}
    return df

holiday_ls = []
for year in range(2009, 2015):
    df = get_holidays(year)
    holiday_ls.append(df)
    holiday_df = pd.concat(holiday_ls)
select_date = list(holiday_df["date"].astype("str"))
holiday = df_all.timestamp_first_active.apply(lambda x : str(x.date())).isin(select_date)

df_all["holiday"] = holiday
df_all['holiday'] = 1 * (df_all.holiday == True)
df_all.head()
affiliate_channelaffiliate_provideragecountry_destinationdate_account_createddate_first_bookingfirst_affiliate_trackedfirst_browserfirst_device_typegender...timestamp_first_activecreate_yearcreate_monthcreate_dayactive_yearactive_monthactive_daylag_dayslag_secondsholiday
0directdirectNaNNDF2010-06-28NaNuntrackedChromeMac Desktop-unknown-...2009-03-19 04:32:55201062820093194669.7035110
1seogoogle38.0NDF2011-05-25NaNuntrackedChromeMac DesktopMALE...2009-05-23 17:48:092011525200952373211.0680280
2directdirect56.0US2010-09-282010-08-02untrackedIEWindows DesktopFEMALE...2009-06-09 23:12:47201092820096947611.3334040
3directdirect42.0other2011-12-052012-09-08untrackedFirefoxMac DesktopFEMALE...2009-10-31 06:01:292011125200910317659.9845601
4directdirect41.0US2010-09-142010-02-18untrackedChromeMac Desktop-unknown-...2009-12-08 06:11:052010914200912828010.0107710

5 rows × 25 columns

weekend

weekday = df_all.filter(items=['id','timestamp_first_active'])
weekday = pd.to_datetime(weekday["timestamp_first_active"], format="%Y-%m-%d")
weekday = weekday.dt.dayofweek

df_all["weekend"] = weekday.apply(lambda x : 1 if x>=5 else 0)
df_all.head()
affiliate_channelaffiliate_provideragecountry_destinationdate_account_createddate_first_bookingfirst_affiliate_trackedfirst_browserfirst_device_typegender...create_yearcreate_monthcreate_dayactive_yearactive_monthactive_daylag_dayslag_secondsholidayweekend
0directdirectNaNNDF2010-06-28NaNuntrackedChromeMac Desktop-unknown-...201062820093194669.70351100
1seogoogle38.0NDF2011-05-25NaNuntrackedChromeMac DesktopMALE...2011525200952373211.06802801
2directdirect56.0US2010-09-282010-08-02untrackedIEWindows DesktopFEMALE...201092820096947611.33340400
3directdirect42.0other2011-12-052012-09-08untrackedFirefoxMac DesktopFEMALE...2011125200910317659.98456011
4directdirect41.0US2010-09-142010-02-18untrackedChromeMac Desktop-unknown-...2010914200912828010.01077100

5 rows × 26 columns

df_all = df_all.drop("date_account_created" , axis=1)
df_all = df_all.drop("timestamp_first_active" , axis=1)

faithless sign-in

checklist = (df_all['age'] < 120) & (df_all['gender'] != '-unknown-')

df_all['faithless_sign'] = checklist.apply(lambda x : 0 if x == True else 1)
df_all.head()
affiliate_channelaffiliate_provideragecountry_destinationdate_first_bookingfirst_affiliate_trackedfirst_browserfirst_device_typegenderid...create_monthcreate_dayactive_yearactive_monthactive_daylag_dayslag_secondsholidayweekendfaithless_sign
0directdirectNaNNDFNaNuntrackedChromeMac Desktop-unknown-gxn3p5htnn...62820093194669.703511001
1seogoogle38.0NDFNaNuntrackedChromeMac DesktopMALE820tgsjxq7...525200952373211.068028010
2directdirect56.0US2010-08-02untrackedIEWindows DesktopFEMALE4ft3gnwmtx...92820096947611.333404000
3directdirect42.0other2012-09-08untrackedFirefoxMac DesktopFEMALEbjjt8pjhuk...125200910317659.984560110
4directdirect41.0US2010-02-18untrackedChromeMac Desktop-unknown-87mebub9p4...914200912828010.010771001

5 rows × 25 columns

null data

df_train.isnull().sum()
id                              0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                          0
age                         87990
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6065
signup_app                      0
first_device_type               0
first_browser                   0
country_destination             0
dtype: int64
df_test.isnull().sum()
id                             0
date_account_created           0
timestamp_first_active         0
date_first_booking         62096
gender                         0
age                        28876
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked       20
signup_app                     0
first_device_type              0
first_browser                  0
dtype: int64
#null data of country_destination is due to concanate train / test data.
df_all.isnull().sum()
affiliate_channel               0
affiliate_provider              0
age                        116866
country_destination         62096
date_first_booking         186639
first_affiliate_tracked      6085
first_browser                   0
first_device_type               0
gender                          0
id                              0
language                        0
signup_app                      0
signup_flow                     0
signup_method                   0
create_year                     0
create_month                    0
create_day                      0
active_year                     0
active_month                    0
active_day                      0
lag_days                        0
lag_seconds                     0
holiday                         0
weekend                         0
faithless_sign                  0
dtype: int64
print('Train data missing data ratio')
print('date_frist_booking      :',round(df_train.date_first_booking.isnull().sum() / len(df_train) * 100, 2), "%")
print('age                     :',round(df_train.age.isnull().sum() / len(df_train) * 100, 2), "%")
print('first_affiliate_tracked :',round(df_train.first_affiliate_tracked.isnull().sum() / len(df_train) * 100, 2), "%")
Train data missing data ratio
date_frist_booking      : 58.35 %
age                     : 41.22 %
first_affiliate_tracked : 2.84 %
print('Test data missing data ratio')
print('date_frist_booking      :',round(df_test.date_first_booking.isnull().sum() / len(df_test) * 100, 2), "%")
print('age                     :',round(df_test.age.isnull().sum() / len(df_test) * 100, 2), "%")
print('first_affiliate_tracked :',round(df_test.first_affiliate_tracked.isnull().sum() / len(df_test) * 100, 2), "%")
Test data missing data ratio
date_frist_booking      : 100.0 %
age                     : 46.5 %
first_affiliate_tracked : 0.03 %

drop data_first_booking data

df_all = df_all.drop("date_first_booking", axis=1)

fill first_affiliate_tracked data by mode

sns.countplot(df_all["first_affiliate_tracked"])
plt.show()

png

df_all.first_affiliate_tracked.mode()
0    untracked
dtype: object
df_all["first_affiliate_tracked"] = df_all["first_affiliate_tracked"].replace(np.nan, "untracked")

age data

#more than 200 years old data
#keep this data for faithless-signin
#training data only
len(df_train[df_train['age']>200])
779
df_train[df_train['age']>200].head()
iddate_account_createdtimestamp_first_activedate_first_bookinggenderagesignup_methodsignup_flowlanguageaffiliate_channelaffiliate_providerfirst_affiliate_trackedsignup_appfirst_device_typefirst_browsercountry_destination
388v2x0ms9c622010-04-11201004110656022010-04-13-unknown-2014.0basic3enothercraigslistuntrackedWebWindows DesktopFirefoxFR
673umf1wdk9uc2010-05-2520100525155541NaNFEMALE2014.0basic2enothercraigslistuntrackedWebMac DesktopSafariNDF
1040m82epwn7i82010-07-14201007142305562010-07-15MALE2014.0facebook0enothercraigslistuntrackedWebMac DesktopChromeUS
11772th813zdx72010-07-25201007252344192010-07-26MALE2013.0facebook3endirectdirectuntrackedWebMac DesktopChromeUS
12003amf04n3o32010-07-27201007271904472010-07-29FEMALE2014.0basic2endirectdirectuntrackedWebWindows DesktopIEUS
#on each destinations, it has various age distribution. 
#It would be better filling the null age data than dropping the age data. 
age_temp = df_train.copy()
age_temp[ age_temp['age'] > 200 ] = np.nan
sns.boxplot(x=age_temp.country_destination, y=age_temp.age, data=age_temp)
plt.show()

png

one-hot encoding before fill age data

df_age = df_all.filter(items = ['age', 'country_destination','id', 'gender'])
df_dummy = df_all.filter(items = ['affiliate_channel', 'affiliate_provider',
                                       'first_affiliate_tracked', 'first_browser', 'first_device_type',
                                       'language', 'signup_app', 'signup_flow', 'signup_method', 
                                       'create_year', 'create_month', 'create_day', 
                                       'active_year', 'active_month', 'active_day', 'lag_days', 'lag_seconds', 
                                       'holiday', 'weekend'])
    
df_dummy = pd.get_dummies(df_dummy)
df_all = pd.concat([df_age, df_dummy], axis=1)
df_all.shape
(275547, 146)
df_all.head()
agecountry_destinationidgendersignup_flowcreate_yearcreate_monthcreate_dayactive_yearactive_month...language_trlanguage_zhsignup_app_Androidsignup_app_Mowebsignup_app_Websignup_app_iOSsignup_method_basicsignup_method_facebooksignup_method_googlesignup_method_weibo
0NaNNDFgxn3p5htnn-unknown-0201062820093...0000100100
138.0NDF820tgsjxq7MALE0201152520095...0000100100
256.0US4ft3gnwmtxFEMALE3201092820096...0000101000
342.0otherbjjt8pjhukFEMALE02011125200910...0000100100
441.0US87mebub9p4-unknown-02010914200912...0000101000

5 rows × 146 columns

fill age data

#divide train / test by null age data
age_train = df_all[df_all["age"].notnull()].reset_index(drop=True)
age_test = df_all[df_all["age"].isnull()].reset_index(drop=True)
#divide 5 cluster age data
bins = [0, 15, 25, 35, 60, 9999]
labels = ["underage", "tweenty", "thirty", "mid_old", "old"]
cats = pd.cut(age_train['age'], bins, labels=labels)
cats = pd.DataFrame(cats)
age_train_id = age_train.id
age_test_id = age_test.id

age_train = age_train.drop(['id', 'age', 'country_destination', 'gender'], axis=1)
age_test = age_test.drop(['id', 'age', 'country_destination', 'gender'], axis=1)
X = age_train
y = cats

age_train.shape, y.shape, age_test.shape
((158681, 142), (158681, 1), (116866, 142))
#model recall rate is so low, but it gives better cross validation score for final prediction model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
    
model_age = lgb.LGBMClassifier(boosting_type='gbdt', n_jobs=-1, reg_alpha=0.5, reg_lambda=0.5).fit(X_train, y_train)
pred_age = model_age.predict(X_test)

print(classification_report(y_test, pred_age))
             precision    recall  f1-score   support

    mid_old       0.48      0.38      0.42     11150
        old       0.75      0.00      0.00      2044
     thirty       0.49      0.78      0.60     14036
    tweenty       0.45      0.04      0.07      4490
   underage       0.25      0.06      0.10        17

avg / total       0.50      0.48      0.42     31737
#prediction age
pred_age = model_age.predict(age_test)
pred_age = pd.DataFrame(pred_age, columns=['age'])
pred_age = pd.concat([pred_age, age_test_id], axis=1)
pred_age["age"] = pred_age["age"].replace({'underage':15, "tweenty" : 25, "thirty" : 35, 'mid_old' : 45, 'old' : 60})

#original age
origin_age = y
origin_age = pd.DataFrame(origin_age, columns=['age'])
origin_age = pd.concat([origin_age, age_train_id], axis=1)
origin_age["age"] = origin_age["age"].replace({'underage':15, "tweenty" : 25, "thirty" : 35, 'mid_old' : 45, 'old' : 60})

#concat original age and prediction age
age = pd.concat([origin_age, pred_age], axis=0)
print('age lenght check :', len(age))
age.head()
age lenght check : 275547
ageid
045820tgsjxq7
1454ft3gnwmtx
245bjjt8pjhuk
34587mebub9p4
445lsw9q7uk0j
df_all = df_all.drop("age" , axis=1)

df_all = pd.merge(df_all, age, on="id", how="left")
df_all.head()
country_destinationidgendersignup_flowcreate_yearcreate_monthcreate_dayactive_yearactive_monthactive_day...language_zhsignup_app_Androidsignup_app_Mowebsignup_app_Websignup_app_iOSsignup_method_basicsignup_method_facebooksignup_method_googlesignup_method_weiboage
0NDFgxn3p5htnn-unknown-020106282009319...00010010035
1NDF820tgsjxq7MALE020115252009523...00010010045
2US4ft3gnwmtxFEMALE32010928200969...00010100045
3otherbjjt8pjhukFEMALE0201112520091031...00010010045
4US87mebub9p4-unknown-020109142009128...00010100045

5 rows × 146 columns

df_all.isnull().sum()
country_destination                       62096
id                                            0
gender                                        0
signup_flow                                   0
create_year                                   0
create_month                                  0
create_day                                    0
active_year                                   0
active_month                                  0
active_day                                    0
lag_days                                      0
lag_seconds                                   0
holiday                                       0
weekend                                       0
affiliate_channel_api                         0
affiliate_channel_content                     0
affiliate_channel_direct                      0
affiliate_channel_other                       0
affiliate_channel_remarketing                 0
affiliate_channel_sem-brand                   0
affiliate_channel_sem-non-brand               0
affiliate_channel_seo                         0
affiliate_provider_baidu                      0
affiliate_provider_bing                       0
affiliate_provider_craigslist                 0
affiliate_provider_daum                       0
affiliate_provider_direct                     0
affiliate_provider_email-marketing            0
affiliate_provider_facebook                   0
affiliate_provider_facebook-open-graph        0
                                          ...  
language_el                                   0
language_en                                   0
language_es                                   0
language_fi                                   0
language_fr                                   0
language_hr                                   0
language_hu                                   0
language_id                                   0
language_is                                   0
language_it                                   0
language_ja                                   0
language_ko                                   0
language_nl                                   0
language_no                                   0
language_pl                                   0
language_pt                                   0
language_ru                                   0
language_sv                                   0
language_th                                   0
language_tr                                   0
language_zh                                   0
signup_app_Android                            0
signup_app_Moweb                              0
signup_app_Web                                0
signup_app_iOS                                0
signup_method_basic                           0
signup_method_facebook                        0
signup_method_google                          0
signup_method_weibo                           0
age                                           0
Length: 146, dtype: int64

filling gender data

df_all.gender.value_counts()
-unknown-    129480
FEMALE        77524
MALE          68209
OTHER           334
Name: gender, dtype: int64
sns.countplot(df_all["gender"])
plt.show()

png

df_all["gender"] = df_all["gender"].replace(['-unknown-', 'OTHER'], np.nan)

gender_train = df_all[df_all["gender"].notnull()].reset_index()
gender_test = df_all[df_all["gender"].isnull()].reset_index()
y = gender_train.gender

gender_train_id = gender_train.id
gender_test_id = gender_test.id

gender_train = gender_train.drop(['id', 'age', 'country_destination', 'gender'], axis=1)
gender_test = gender_test.drop(['id', 'age', 'country_destination', 'gender'], axis=1)
X = gender_train

gender_train.shape, y.shape, gender_test.shape
((145733, 143), (145733,), (129814, 143))
#model recall rate is so low, but it gives better cross validation score for final prediction model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# X_train, X_test, y_train = gender_train, gender_test, y
    
model_age = lgb.LGBMClassifier(n_estimators=500, n_jobs=-1, reg_alpha=1).fit(X_train, y_train)
pred_age = model_age.predict(X_test)

print(classification_report(y_test, pred_age))
             precision    recall  f1-score   support

     FEMALE       0.57      0.68      0.62     15550
       MALE       0.54      0.43      0.48     13597

avg / total       0.56      0.56      0.55     29147
pred_gender = model_age.predict(gender_test)
pred_gender = pd.DataFrame(pred_gender)
#prediction age
pred_gender = model_age.predict(gender_test)
pred_gender = pd.DataFrame(pred_gender, columns=['gender'])
pred_gender = pd.concat([pred_gender, gender_test_id], axis=1)

#original age
origin_gender = y
origin_gender = pd.DataFrame(origin_gender, columns=['gender'])
origin_gender = pd.concat([origin_gender, gender_train_id], axis=1)

#concat original age and prediction age
gender = pd.concat([origin_gender, pred_gender], axis=0)
print('gender lenght check :', len(gender))
gender.head()
gender lenght check : 275547
genderid
0MALE820tgsjxq7
1FEMALE4ft3gnwmtx
2FEMALEbjjt8pjhuk
3FEMALElsw9q7uk0j
4FEMALE0d01nltbrs
df_all = df_all.drop("gender" , axis=1)

df_all = pd.merge(df_all, gender, on="id", how="left")
df_all.head()
country_destinationidsignup_flowcreate_yearcreate_monthcreate_dayactive_yearactive_monthactive_daylag_days...signup_app_Androidsignup_app_Mowebsignup_app_Websignup_app_iOSsignup_method_basicsignup_method_facebooksignup_method_googlesignup_method_weiboagegender
0NDFgxn3p5htnn020106282009319466...0010010035FEMALE
1NDF820tgsjxq7020115252009523732...0010010045MALE
2US4ft3gnwmtx32010928200969476...0010100045FEMALE
3otherbjjt8pjhuk0201112520091031765...0010010045FEMALE
4US87mebub9p4020109142009128280...0010100045MALE

5 rows × 146 columns

df_all.gender.value_counts()
FEMALE    169870
MALE      105677
Name: gender, dtype: int64

sessions

action, action_type, action_detail count

def make_merged_sessions():
    
    sessions = df_sessions

    #session elapsed group by sum, mean
    ses = sessions.filter(items=('user_id', 'secs_elapsed'))
     
    print(" groupby aggregation in process...")

        
    ses_groupby_sum = ses.groupby("user_id").agg(np.sum)
    ses_groupby_mean = ses.groupby("user_id").agg(np.mean)
    
    #fill null data with mode value
    sessions["action"] = sessions["action"].fillna("show")
    sessions["action_type"] = sessions["action_type"].fillna("view")
    sessions["action_detail"] = sessions["action_detail"].fillna("view_search_results")
    
    id_groupby = sessions.groupby(sessions["user_id"]).agg(mode)
    
    device_type = []
    action = []
    action_type = []
    action_detail = []
    secs_elapsed = []

    print("id groupby in process...")
    
    for i in range(len(id_groupby.index)):
        device_type.append(id_groupby['device_type'][i][0])
        action.append(id_groupby['action'][i][0])
        action_type.append(id_groupby['action_type'][i][0])
        action_detail.append(id_groupby['action_detail'][i][0])
        secs_elapsed.append(id_groupby['secs_elapsed'][i][0])
    
    id_groupby_df = pd.DataFrame({"id":id_groupby.index ,
                                  "device_type":device_type ,
                                  "action":action,
                                  "action_type":action_type,
                                  "action_detail":action_detail,
                                  "secs_elapsed":secs_elapsed
                                  })
    
     
    print("merge sessions in process...")
    
    merge_ses_groupby = pd.merge(ses_groupby_sum, ses_groupby_mean, left_index=True, right_index=True, how="left")
    merge_ses_groupby = merge_ses_groupby.rename(columns={'secs_elapsed_x': 'secs_sum', 'secs_elapsed_y': 'secs_mean'})
    
    merged_sessions = pd.merge(id_groupby_df, merge_ses_groupby, left_on="id", right_index=True, how="left")
    
    merged_sessions['secs_elapsed'] = merged_sessions['secs_elapsed'].astype(float)
    
    merged_sessions['secs_mean'] = merged_sessions['secs_mean'].fillna(0)
    
    merged_sessions.to_csv("merged_sessions.csv", index=False)
%%time
make_merged_sessions()
 groupby aggregation in process...
id groupby in process...
merge sessions in process...
CPU times: user 4min 32s, sys: 4.14 s, total: 4min 36s
Wall time: 4min 35s
def remove_word():
    
    merged_sessions = pd.read_csv("merged_sessions.csv")

    def remove(word):
        word = re.sub("''", "", word)
        word = re.sub("\W", "", word)
        return word

    merged_sessions["action"] = merged_sessions["action"].apply(remove)
    merged_sessions["action_detail"] = merged_sessions["action_detail"].apply(remove)
    merged_sessions["action_type"] = merged_sessions["action_type"].apply(remove)
    merged_sessions["device_type"] = merged_sessions["device_type"].apply(remove)


    merged_sessions["action_detail"] = merged_sessions["action_detail"].replace({"['-unknown-']":"unknown"})
    merged_sessions["action_type"] = merged_sessions["action_type"].replace({"['-unknown-']":"unknown"})
    merged_sessions["device_type"] = merged_sessions["device_type"].replace({"['-unknown-']":"unknown", "['Android App Unknown Phone/Tablet']": "Androd_unkown_phone"})

    merged_sessions.to_csv("merged_sessions.csv", index=False)
%%time
remove_word()
CPU times: user 2.67 s, sys: 31.9 ms, total: 2.7 s
Wall time: 2.72 s
def sessions_detail_add():

    merged_sessions = pd.read_csv("merged_sessions.csv")
    sessions = df_sessions

    print("groupby count in process...")

    
    tmp = sessions.groupby(["user_id", "action_type"])["device_type"].count().unstack().fillna(0)
    sessions_at = pd.DataFrame(tmp)
    sessions_at.rename(columns = lambda x : "type__" + x, inplace = True)

    tmp = sessions.groupby(["user_id", "action"])["device_type"].count().unstack().fillna(0)
    sessions_a = pd.DataFrame(tmp)
    sessions_a.rename(columns = lambda x : "action__" + x, inplace = True)

    tmp = sessions.groupby(["user_id", "action_detail"])["device_type"].count().unstack().fillna(0)
    sessions_ad = pd.DataFrame(tmp)
    sessions_ad.rename(columns = lambda x : "detail__" + x, inplace = True)

    df_session_info = sessions_at.merge(sessions_a, how = "outer", left_index = True, right_index = True)
    df_session_info = df_session_info.merge(sessions_ad, how = "left", left_index = True, right_index = True)

    df_session_info.drop(["type__-unknown-", "detail__-unknown-"], axis = 1, inplace = True)
    df_session_info = df_session_info.fillna(0)

    print("merge sessions in process...")
    
    last_merged_sessions = pd.merge(merged_sessions, df_session_info, left_on='id', right_index=True, how='left')

    last_merged_sessions.to_csv("merged_sessions.csv", index=False)
%%time
sessions_detail_add()
groupby count in process...
merge sessions in process...
CPU times: user 1min 10s, sys: 5.69 s, total: 1min 16s
Wall time: 1min 17s
merged_sessions = pd.read_csv("merged_sessions.csv")
merged_sessions.head()
actionaction_detailaction_typedevice_typeidsecs_elapsedsecs_sumsecs_meantype__booking_requesttype__booking_response...detail__view_resolutionsdetail__view_search_resultsdetail__view_security_checksdetail__view_user_real_namesdetail__wishlistdetail__wishlist_content_updatedetail__wishlist_notedetail__your_listingsdetail__your_reservationsdetail__your_trips
0showview_search_resultsviewMacDesktop00023iyk9l820.0867896.022253.7435901.00.0...0.08.00.00.00.04.00.00.00.02.0
1showview_search_resultsviewMacDesktop0010k6l0om125.0586543.09460.3709680.00.0...0.025.00.00.00.08.00.00.00.00.0
2searchview_search_resultsclickAndroidAppUnknownPhoneTablet001wyh0pz8623.0282965.03179.3820220.00.0...0.071.00.00.00.00.00.00.00.00.0
3showp3viewunknown0028jgx1x13.0297010.09900.3333330.00.0...0.09.00.00.00.00.00.00.00.00.0
4showview_search_resultsviewiPhone002qnbzfs51.06487080.08232.3350251.00.0...0.0202.00.00.00.00.00.00.00.00.0

5 rows × 530 columns

merged_sessions.shape
(135483, 530)
gc.collect()
141
df_all = pd.merge(df_all, merged_sessions, on="id", how="left")
df_all.head()
country_destinationidsignup_flowcreate_yearcreate_monthcreate_dayactive_yearactive_monthactive_daylag_days...detail__view_resolutionsdetail__view_search_resultsdetail__view_security_checksdetail__view_user_real_namesdetail__wishlistdetail__wishlist_content_updatedetail__wishlist_notedetail__your_listingsdetail__your_reservationsdetail__your_trips
0NDFgxn3p5htnn020106282009319466...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1NDF820tgsjxq7020115252009523732...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2US4ft3gnwmtx32010928200969476...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3otherbjjt8pjhuk0201112520091031765...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4US87mebub9p4020109142009128280...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN

5 rows × 675 columns

df_all.shape
(275547, 675)
gc.collect()
91

one hot encoding

target = df_all.country_destination[:213451]
df_all = df_all.drop(["country_destination", "id"], axis=1)
df_all = pd.get_dummies(df_all)
gc.collect()
123

fill null data with imputer

%%time
## impute the missing value using median
from sklearn.preprocessing import Imputer

## impute the missing value using median
impute_list = df_all.columns.tolist()
# impute_list.remove("id")
# impute_list.remove("country_destination")

imp = Imputer(missing_values='NaN', strategy='median', axis=0)

df_all[impute_list] = imp.fit_transform(df_all[impute_list])
# test[impute_list] = imp.fit_transform(test[impute_list])

gc.collect()
CPU times: user 1min 28s, sys: 22.7 s, total: 1min 51s
Wall time: 1min 51s

split train / test

train = df_all[:213451]
test = df_all[213451:]
train.head()
signup_flowcreate_yearcreate_monthcreate_dayactive_yearactive_monthactive_daylag_dayslag_secondsholiday...device_type_LinuxDesktopdevice_type_MacDesktopdevice_type_OperaPhonedevice_type_Tabletdevice_type_WindowsDesktopdevice_type_WindowsPhonedevice_type_iPadTabletdevice_type_iPhonedevice_type_iPodtouchdevice_type_unknown
00.02010.06.028.02009.03.019.0466.09.7035110.0...0.00.00.00.00.00.00.00.00.00.0
10.02011.05.025.02009.05.023.0732.011.0680280.0...0.00.00.00.00.00.00.00.00.00.0
23.02010.09.028.02009.06.09.0476.011.3334040.0...0.00.00.00.00.00.00.00.00.00.0
30.02011.012.05.02009.010.031.0765.09.9845601.0...0.00.00.00.00.00.00.00.00.00.0
40.02010.09.014.02009.012.08.0280.010.0107710.0...0.00.00.00.00.00.00.00.00.00.0

5 rows × 953 columns

train.isnull().sum()
signup_flow                                 0
create_year                                 0
create_month                                0
create_day                                  0
active_year                                 0
active_month                                0
active_day                                  0
lag_days                                    0
lag_seconds                                 0
holiday                                     0
weekend                                     0
affiliate_channel_api                       0
affiliate_channel_content                   0
affiliate_channel_direct                    0
affiliate_channel_other                     0
affiliate_channel_remarketing               0
affiliate_channel_sem-brand                 0
affiliate_channel_sem-non-brand             0
affiliate_channel_seo                       0
affiliate_provider_baidu                    0
affiliate_provider_bing                     0
affiliate_provider_craigslist               0
affiliate_provider_daum                     0
affiliate_provider_direct                   0
affiliate_provider_email-marketing          0
affiliate_provider_facebook                 0
affiliate_provider_facebook-open-graph      0
affiliate_provider_google                   0
affiliate_provider_gsp                      0
affiliate_provider_meetup                   0
                                           ..
action_detail_view_reservations             0
action_detail_view_search_results           0
action_detail_wishlist                      0
action_detail_wishlist_content_update       0
action_detail_your_listings                 0
action_detail_your_reservations             0
action_detail_your_trips                    0
action_type_booking_request                 0
action_type_click                           0
action_type_data                            0
action_type_message_post                    0
action_type_modify                          0
action_type_partner_callback                0
action_type_submit                          0
action_type_unknown                         0
action_type_view                            0
device_type_AndroidAppUnknownPhoneTablet    0
device_type_AndroidPhone                    0
device_type_Blackberry                      0
device_type_Chromebook                      0
device_type_LinuxDesktop                    0
device_type_MacDesktop                      0
device_type_OperaPhone                      0
device_type_Tablet                          0
device_type_WindowsDesktop                  0
device_type_WindowsPhone                    0
device_type_iPadTablet                      0
device_type_iPhone                          0
device_type_iPodtouch                       0
device_type_unknown                         0
Length: 953, dtype: int64
test.head()
signup_flowcreate_yearcreate_monthcreate_dayactive_yearactive_monthactive_daylag_dayslag_secondsholiday...device_type_LinuxDesktopdevice_type_MacDesktopdevice_type_OperaPhonedevice_type_Tabletdevice_type_WindowsDesktopdevice_type_WindowsPhonedevice_type_iPadTabletdevice_type_iPhonedevice_type_iPodtouchdevice_type_unknown
2134510.02014.07.01.02014.07.01.00.01.7917590.0...0.00.00.00.00.00.00.00.00.01.0
2134520.02014.07.01.02014.07.01.00.03.9318260.0...0.00.00.00.00.00.00.00.00.01.0
2134530.02014.07.01.02014.07.01.00.04.6821310.0...0.00.00.00.01.00.00.00.00.00.0
2134540.02014.07.01.02014.07.01.00.04.9052750.0...0.00.00.00.01.00.00.00.00.00.0
2134550.02014.07.01.02014.07.01.00.05.2203560.0...0.01.00.00.00.00.00.00.00.00.0

5 rows × 953 columns

test.isnull().sum()
signup_flow                                 0
create_year                                 0
create_month                                0
create_day                                  0
active_year                                 0
active_month                                0
active_day                                  0
lag_days                                    0
lag_seconds                                 0
holiday                                     0
weekend                                     0
affiliate_channel_api                       0
affiliate_channel_content                   0
affiliate_channel_direct                    0
affiliate_channel_other                     0
affiliate_channel_remarketing               0
affiliate_channel_sem-brand                 0
affiliate_channel_sem-non-brand             0
affiliate_channel_seo                       0
affiliate_provider_baidu                    0
affiliate_provider_bing                     0
affiliate_provider_craigslist               0
affiliate_provider_daum                     0
affiliate_provider_direct                   0
affiliate_provider_email-marketing          0
affiliate_provider_facebook                 0
affiliate_provider_facebook-open-graph      0
affiliate_provider_google                   0
affiliate_provider_gsp                      0
affiliate_provider_meetup                   0
                                           ..
action_detail_view_reservations             0
action_detail_view_search_results           0
action_detail_wishlist                      0
action_detail_wishlist_content_update       0
action_detail_your_listings                 0
action_detail_your_reservations             0
action_detail_your_trips                    0
action_type_booking_request                 0
action_type_click                           0
action_type_data                            0
action_type_message_post                    0
action_type_modify                          0
action_type_partner_callback                0
action_type_submit                          0
action_type_unknown                         0
action_type_view                            0
device_type_AndroidAppUnknownPhoneTablet    0
device_type_AndroidPhone                    0
device_type_Blackberry                      0
device_type_Chromebook                      0
device_type_LinuxDesktop                    0
device_type_MacDesktop                      0
device_type_OperaPhone                      0
device_type_Tablet                          0
device_type_WindowsDesktop                  0
device_type_WindowsPhone                    0
device_type_iPadTablet                      0
device_type_iPhone                          0
device_type_iPodtouch                       0
device_type_unknown                         0
Length: 953, dtype: int64
train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 213451 entries, 0 to 213450
Columns: 953 entries, signup_flow to device_type_unknown
dtypes: float64(953)
memory usage: 1.5 GB
test.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 62096 entries, 213451 to 275546
Columns: 953 entries, signup_flow to device_type_unknown
dtypes: float64(953)
memory usage: 452.0 MB
train.shape, test.shape, target.shape
((213451, 953), (62096, 953), (213451,))
gc.collect()
14

modeling

def submit_kaggle(df_train, df_test, target, reg_alpha, reg_lambda, learning_rate, n_estimators):
    
    le = LabelEncoder()

    y_train = le.fit_transform(target)
 
    model = lgb.LGBMClassifier(boosting_type= 'gbdt',nthread=3, n_jobs=-1, reg_alpha=reg_alpha, reg_lambda=reg_lambda, max_depth=-1, learning_rate=learning_rate, n_estimators=n_estimators)

    print("cross validation started ...")
    #negative cross entropy
    print(np.mean(cross_val_score(model, df_train, y_train, n_jobs=-1, )))
    print()
    print("model fitting starting ...")
    
    model = model.fit(df_train, y_train)
       
    print("model fitting completed ...")
    print()
    
    predic_proba = model.predict_proba(df_test)
    
    df_submit = pd.DataFrame(columns=["id", "country"])
    ids = []
    cts = []
    for i in range(len(test_id)):
        idx = test_id.iloc[i]
        ids += [idx] * 5
        cts += le.inverse_transform(np.argsort(predic_proba[i])[::-1])[:5].tolist()
        
    df_submit["id"] = ids
    df_submit["country"] = cts
    df_submit.to_csv('submission.csv', index = False)
    print("kaggle submission in process ...")
    ! kaggle competitions submit -c airbnb-recruiting-new-user-bookings -f submission.csv -m "Message"
    
    gc.collect()
    
    return model, df_train, df_test, target
%%time
model, df_train, df_test, target = submit_kaggle(train, test, target, 
                                                      reg_alpha=1, 
                                                      reg_lambda=0, 
                                                      learning_rate=0.05, 
                                                      n_estimators=400)
cross validation started ...
0.26659212543108246

model fitting starting ...
model fitting completed ...

kaggle submission in process ...
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/jk/.kaggle/kaggle.json'
Successfully submitted to Airbnb New User BookingsCPU times: user 11min 19s, sys: 8.83 s, total: 11min 27s
Wall time: 11min 7s

feature importance

print("feature importance mean :", round(np.mean(model.feature_importances_), 2))
print("feature importance median :", round(np.median(model.feature_importances_), 2))

lgb.plot_importance(model, figsize=(50, 50), max_num_features=50)
plt.show()
feature importance mean : 148.07
feature importance median : 0.0

feature selection by feature importance

model1 = SelectFromModel(model, prefit=True, threshold=1)
X_train = model1.transform(df_train)
X_test = model1.transform(df_test)

X_train.shape, X_test.shape, target.shape
((213451, 451), (62096, 451), (213451,))
%%time
model, df_train, df_test, target = submit_kaggle(X_train, X_test, target, 
                                                      reg_alpha=1, 
                                                      reg_lambda=0, 
                                                      learning_rate=0.1, 
                                                      n_estimators=100)
cross validation started ...
print("feature importance mean :", round(np.mean(model.feature_importances_), 2))
print("feature importance median :", round(np.median(model.feature_importances_), 2))

lgb.plot_importance(model, figsize=(50, 50), max_num_features=50)
plt.show()
#0.87814 #top22% #326/1462

© 2018. All rights reserved.

Powered by Hydejack v7.5.0