06_pandas_useful_code

# data is a DataFrame type

data.sample(nums)     # randomly sample nums rows from the DataFrame

data.col.unique()        # return all distinct values taken by col

# For any variable (continuous, discrete or categorical), plot a histogram of the values of col
fig = data.loan_amnt.hist(bins=50)   # loan_amnt is a col in data
fig.set_title('loan Amount hist')
fig.set_xlabel('loan Amount')
fig.set_ylabel('Number of Loans')

data.open_acc.dropna().unique() # drop the missing values in the column, then list its unique values

np.where(binary_array, 1, 0) # entries that are True in binary_array become 1, everything else becomes 0

data['defaulted'] = np.where(data.loan_status.isin(['Default']), 1, 0)

data.loan_status.isin(['Default']) # check whether loan_status equals 'Default'; returns a boolean Series
data['col'].value_counts()  # count how many times each value of col appears

# for each category of home ownership

fig = data['home_ownership'].value_counts().plot.bar()
fig.set_title('Home Ownership')
fig.set_ylabel('Number of customers')

data.groupby([col])['y'].sum()  # sum of y within each category of col

Missing Values

data.isnull().sum() # number of null values in each column

data.isnull().mean() # fraction of null values in each column

data['cabin_null'] = np.where(data.Cabin.isnull(), 1, 0)

data.groupby(['Survived'])['cabin_null'].mean()  # mean of the 0/1 flag = share of missing cabins per Survived group
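
# The next snippet collapses emp_length into coarser groups via a mapping dictionary.
# length_dict is not defined in these notes; a hypothetical example (two buckets,
# anything that is not '10+ years' goes into '0-10 years'):
length_dict = {k: ('10+ years' if k == '10+ years' else '0-10 years')
               for k in data.emp_length.dropna().unique()}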

data['emp_length_redefined'] = data.emp_length.map(length_dict)

data.emp_length_redefined.unique()

# fraction of borrowers with missing emp_title in each emp_length group;
# value is not defined in the original notes, presumably the total number of rows with missing emp_title
value = len(data[data.emp_title.isnull()])
data[data.emp_title.isnull()].groupby(['emp_length_redefined'])['emp_length'].count().sort_values() / value

Outliers

import seaborn as sns
sns.distplot(data.Age)  # distplot is deprecated in newer seaborn; sns.histplot(data.Age, kde=True) is the current equivalent

# another way of visualising outliers is using boxplots and whiskers,
# which show the quartiles (box) and the inter-quartile range (whiskers),
# with the outliers sitting outside the whiskers.

# All the dots in the plot below are outliers according to the quartiles ± 1.5 * IQR rule

fig = data.boxplot(column='Fare')
fig.set_title('')
fig.set_xlabel('Survived')
fig.set_ylabel('Fare')

# let's look at the values of the quantiles so we can
# calculate the upper and lower boundaries for the outliers

# 25%, 50% and 75% in the output below indicate the
# 25th percentile, the median and the 75th percentile respectively

data.Fare.describe()
# Let's calculate the upper and lower boundaries
# to identify outliers according
# to the inter-quartile range (IQR) proximity rule

IQR = data.Fare.quantile(0.75) - data.Fare.quantile(0.25)

Lower_fence = data.Fare.quantile(0.25) - (IQR * 1.5)
Upper_fence = data.Fare.quantile(0.75) + (IQR * 1.5)

Upper_fence, Lower_fence, IQR
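
# The snippet below groups high_fare_df by ticket; high_fare_df is not defined in
# these notes, but presumably it is the subset of passengers whose Fare lies above
# the upper fence (an assumption):
high_fare_df = data[data.Fare > Upper_fence]
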
multiple_tickets = pd.concat(
    [
        high_fare_df.groupby('Ticket')['Fare'].count(),
        high_fare_df.groupby('Ticket')['Fare'].mean()
    ],
    axis=1)

# first column: number of passengers sharing the ticket; second column: their mean fare
multiple_tickets.columns = ['n_passengers', 'mean_fare']
multiple_tickets.head(10)

Rare Values

# Let's make a combined plot of the label frequency and
# the time to pass testing.
# This will help us visualise the relationship between the
# target and the labels of X3

fig, ax = plt.subplots(figsize=(8, 4))
plt.xticks(temp_df.index, temp_df['X3'], rotation=0)

ax2 = ax.twinx()
ax.bar(temp_df.index, temp_df["X3_perc_cars"], color='lightgrey')
ax2.plot(temp_df.index, temp_df["y"], color='green', label='Seconds')
ax.set_ylabel('percentage of cars per category')
ax2.set_ylabel('Seconds')
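
# The loop below repeats the plot above for every categorical column.
# cols_to_use and total_cars are not defined in these notes; plausible placeholders
# (the exact column list is an assumption):
cols_to_use = ['X1', 'X2', 'X3', 'X6']  # hypothetical list of categorical columns
total_cars = len(data)                  # total number of rows in the dataset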

# let's automate the above process for all the categorical variables

for col in cols_to_use:
    # calculate the frequency of the different labels in the variable
    temp_df = pd.Series(data[col].value_counts() / total_cars).reset_index()

    # rename the columns
    temp_df.columns = [col, col + '_perc_cars']

    # merge onto the mean time to pass the test
    temp_df = temp_df.merge(
        data.groupby([col])['y'].mean().reset_index(), on=col, how='left')

    # plot the figure as shown above
    fig, ax = plt.subplots(figsize=(8, 4))
    plt.xticks(temp_df.index, temp_df[col], rotation=0)
    ax2 = ax.twinx()

    ax.bar(
        temp_df.index,
        temp_df[col + '_perc_cars'],
        color='lightgrey',
        label=col)

    ax2.plot(
        temp_df.index,
        temp_df["y"],
        color='green',
    )

    ax.set_ylabel('percentage of cars per category')
    ax2.set_ylabel('Seconds')
    ax.legend()
    plt.show()
# let's automate the replacement of infrequent categories
# by the label 'rare' in the remaining categorical variables

# I start from 1 because I already replaced the first variable in
# the list
for col in cols_to_use[1:]:
    
    # calculate the % of cars in each category
    temp_df = pd.Series(data[col].value_counts() / total_cars)

    # create a dictionary to replace the rare labels with the
    # string 'rare'
    grouping_dict = {
        k: ('rare' if k not in temp_df[temp_df >= 0.1].index else k)
        for k in temp_df.index
    }
    
    # replace the rare labels
    data[col + '_grouped'] = data[col].map(grouping_dict)

data.head()
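
# A quick sanity check of the grouping, reusing total_cars from above: categories
# holding less than 10% of the cars should now be collapsed into the 'rare' label
data['X3_grouped'].value_counts() / total_cars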
# In order to use these variables to build machine learning models with sklearn,
# we first need to replace the labels with numbers.

# The correct way to do this is to first separate the data into training and
# test sets, then create a replacement dictionary using the train set, and
# replace the strings in both train and test using that dictionary.

# This will introduce missing values / NaN in the test set for those labels
# that are not present in the train set; we saw this effect in the previous
# lecture.

# in the section dedicated to rare values later in the course, I will
# show you how to avoid this problem

# now, in order to speed up the demonstration, I will replace the
# labels by strings in the entire dataset, and then divide into
# train and test. 
# but remember: THIS IS NOT GOOD PRACTICE!

# original variables
for col in cols_to_use:
    # create the dict and replace the strings in one line
    data.loc[:, col] = data.loc[:, col].map(
        {k: i
         for i, k in enumerate(data[col].unique(), 0)})

# variables with grouped categories
for col in ['X1_grouped', 'X6_grouped', 'X3_grouped', 'X2_grouped']:
    # create the dict and replace the strings in one line
    data.loc[:, col] = data.loc[:, col].map(
        {k: i
         for i, k in enumerate(data[col].unique(), 0)})
# let's capture the first letter
data['Cabin_reduced'] = data['Cabin'].astype(str).str[0]
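
# X_train and X_test used below are assumed to come from the usual train/test split;
# a minimal sketch with sklearn (column choice and split parameters are assumptions):
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data[['Cabin', 'Cabin_reduced']],  # hypothetical feature selection
    data['Survived'],
    test_size=0.3,
    random_state=0)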
# Now I will replace the letters in the reduced cabin variable

# create replace dictionary
cabin_dict = {k: i for i, k in enumerate(X_train['Cabin_reduced'].unique(), 0)}

# replace labels by numbers with dictionary
X_train.loc[:, 'Cabin_reduced'] = X_train.loc[:, 'Cabin_reduced'].map(cabin_dict)
X_test.loc[:, 'Cabin_reduced'] = X_test.loc[:, 'Cabin_reduced'].map(cabin_dict)
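
# labels present only in the test set are not in cabin_dict and map to NaN,
# as described above; a quick check:
X_test['Cabin_reduced'].isnull().sum()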
Original article: https://www.cnblogs.com/ziwh666/p/12326878.html