In this Notebook we're going to demonstrate the typical steps a responsible Data Scientist can take to understand and prepare their Data.
#Import Product DataSet here
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3
def __iter__(self): return 0
# @hidden_cell
# The following code accesses a file in a IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_5ae689904bd0473ea24ad0236f6cf6bb = ibm_boto3.client(service_name='s3',
ibm_api_key_id='BDofu6biX9qVa-nFQ-cf3avUI_ZYjZQqbkp8B7Hg7NKu',
ibm_auth_endpoint="https://iam.eu-gb.bluemix.net/oidc/token",
config=Config(signature_version='oauth'),
endpoint_url='https://s3.eu-geo.objectstorage.service.networklayer.com')
body = client_5ae689904bd0473ea24ad0236f6cf6bb.get_object(Bucket='project1-donotdelete-pr-lawgcttpb9ne6t',Key='Product Data Set - Student 2 of 3.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )
product_data = pd.read_csv(body,sep='|')
product_data.head()
#Import Transaction DataSet Here
body = client_5ae689904bd0473ea24ad0236f6cf6bb.get_object(Bucket='project1-donotdelete-pr-lawgcttpb9ne6t',Key='Transaction Data Set - Student 3 of 3.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )
transaction_data = pd.read_csv(body)
transaction_data.head()
#Import Customer Dataset Here
body = client_5ae689904bd0473ea24ad0236f6cf6bb.get_object(Bucket='project1-donotdelete-pr-lawgcttpb9ne6t',Key='Customer Data Set - Student 1 of 3.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )
customer_data = pd.read_csv(body)
customer_data.head()
product_data.shape
transactions_data.shape
customer_data.shape
type(customer_data)
type(customer_data.AGE)
customer_data.dtypes
customer_data['INCOME']=customer_data['INCOME'].map(lambda x : x.replace('$',''))
customer_data.head(2)
customer_data['INCOME']=customer_data['INCOME'].map(lambda x : int(x.replace(',','')))
customer_data.head(2)
customer_data.dtypes
Now running the “dtypes” method reveals that data type conversion of INCOME was successful
customer_data["MARITAL STATUS"].describe()
customer_data["INCOME"].describe()
customer_data["MARITAL STATUS"].unique()
from datetime import datetime
customer_data['ENROLLMENT DATE']=\
customer_data['ENROLLMENT DATE'][customer_data['ENROLLMENT DATE'].notnull()].\
map(lambda x :datetime.strptime(x, '%d-%m-%Y') )
customer_data.dtypes
Data used in this tutorial is mostly free from data quality issues, however in real life, data scientists deal with data sets that needs to be cleaned and corrected for their quality issues
print('null values for transactoins ?',transactions_data.isnull().values.any())
print('null values for products ?',product_data.isnull().values.any())
print('null values for customers ?',customer_data.isnull().values.any())
customer_data.columns[customer_data.isna().any()].tolist()
It turned out that ENROLMENT DATE is the only column which has null values. The reasons behind is that not all customers are enrolled to loyalty and hence there is no enrolment date
import matplotlib.pyplot as plt
customer_data['MARITAL STATUS'].value_counts().plot(kind='bar')
plt.xlabel("Marital Status")
plt.ylabel("Frequency Distribution")
plt.show()
customer_data['AGE'].hist(bins=10)
plt.show()
plt.figure(figsize=(8,8))
plt.boxplot(customer_data.AGE,0,'rs',1)
plt.grid(linestyle='-',linewidth=1)
plt.show()
customer_data['AGE'].describe()
Remember our business understanding objectives 1-Understanding the factors associated with loyalty program participation 2-Understanding the factors associated with increased spending
trans_products=transactions_data.merge(product_data,how='inner', left_on='PRODUCT NUM', right_on='PRODUCT CODE')
trans_products.head()
trans_products['UNIT LIST PRICE']=trans_products['UNIT LIST PRICE'].map(lambda x : float(x.replace('$','')))
trans_products.dtypes
trans_products['Total_Price']=trans_products['QUANTITY PURCHASED'] * trans_products['UNIT LIST PRICE'] * (1- trans_products['DISCOUNT TAKEN'])
trans_products.head()
Income_by_product = trans_products.groupby('PRODUCT CATEGORY').agg({'Total_Price':'sum'}).sort_values('Total_Price',ascending=False)
Income_by_product
Revenue_by_product=Income_by_product.rename(columns={'Total_Price':'Revenue Per Product'})
Revenue_by_product['Revenue Per Product'].plot(kind='pie',autopct='%1.1f%%',legend = True)
customer_prod_categ=trans_products.groupby(['CUSTOMER NUM','PRODUCT CATEGORY']).agg({'Total_Price':'sum'})
customer_prod_categ.head()
customer_prod_categ.columns
customer_prod_categ.reset_index().head()
customer_prod_categ=customer_prod_categ.reset_index()
customer_pivot=customer_prod_categ.pivot(index='CUSTOMER NUM',columns='PRODUCT CATEGORY',values='Total_Price')
customer_pivot.head()
trans_products['TRANSACTION DATE']=trans_products['TRANSACTION DATE'].map(lambda x :datetime.strptime(x, '%m/%d/%Y') )
recent_trans_total_spend=trans_products.groupby('CUSTOMER NUM').\
agg({'TRANSACTION DATE':'max','Total_Price':'sum'}). \
rename(columns={'TRANSACTION DATE':'RECENT TRANSACTION DATE','Total_Price':'TOTAL SPENT'})
recent_trans_total_spend.head()
customer_KPIs=customer_pivot.merge(recent_trans_total_spend,how='inner',left_index=True, right_index=True )
customer_KPIs.head()
customer_KPIs=customer_KPIs.fillna(0)
customer_KPIs.head()
customer_all_view=customer_data.merge(customer_KPIs,how='inner', left_on='CUSTOMERID', right_index=True)
customer_all_view.head()
table=pd.crosstab(customer_all_view['GENDER'],customer_all_view['LOYALTY GROUP'])
table
table.plot(kind='bar', stacked=True,figsize=(6,6))
plt.show()
table=pd.crosstab(customer_all_view['EXPERIENCE SCORE'],customer_all_view['LOYALTY GROUP'])
table
table.plot(kind='bar', stacked=True,figsize=(6,6))
plt.show()
table=pd.crosstab(customer_all_view['MARITAL STATUS'],customer_all_view['LOYALTY GROUP'])
table.plot(kind='bar', stacked=True,figsize=(6,6))
plt.show()
customer_all_view['AGE_BINNED'] = pd.cut(customer_all_view['AGE'],10) # 10 bins of age
customer_all_view['AGE_BINNED'].value_counts()
table=pd.crosstab(customer_all_view['AGE_BINNED'],customer_all_view['LOYALTY GROUP'])
table.plot(kind='bar', stacked=True,figsize=(6,6))
plt.show()
customer_all_view.groupby("LOYALTY GROUP").agg({'AGE':'mean'})
fig = plt.figure(1, figsize=(9, 6))
ax = fig.add_subplot(111)
plot1=customer_all_view['AGE'][customer_all_view['LOYALTY GROUP'] == "enrolled"]
plot2=customer_all_view['AGE'][customer_all_view['LOYALTY GROUP'] == "notenrolled"]
list1=[plot1,plot2]
ax.boxplot(list1,0,'rs',1)
ax.set_xticklabels(['Enrolled', 'Not Enrolled'])
plt.grid( linestyle='-', linewidth=1)
plt.show()
customer_all_view['TOTAL SPENT BINNED'] = pd.cut(customer_all_view['TOTAL SPENT'],10) # 10 bins of age
table=pd.crosstab(customer_all_view['TOTAL SPENT BINNED'],customer_all_view['LOYALTY GROUP'])
table.plot(kind='bar', stacked=True,figsize=(6,6))
plt.show()
plt.scatter(customer_all_view['AGE'],customer_all_view['TOTAL SPENT'])
plt.xlabel("AGE")
plt.ylabel("Total Spent")
plt.show()
from scipy.stats import pearsonr
pearsonr(customer_all_view['AGE'],customer_all_view['TOTAL SPENT'])
plt.scatter(customer_all_view['INCOME'],customer_all_view['TOTAL SPENT'])
plt.xlabel("Income")
plt.ylabel("Total Spent")
plt.show()
pearsonr(customer_all_view['INCOME'],customer_all_view['TOTAL SPENT'])
table = customer_all_view.groupby(['EXPERIENCE SCORE']).agg({'TOTAL SPENT':'mean'}).reset_index()
table['TOTAL SPENT'].plot(kind='bar')
plt.xlabel("Experience Score")
plt.ylabel("Average Total Spent per Score")
plt.xticks([0,1,2,3,4,5,6,7,8,9],[1,2,3,4,5,6,7,8,9,10])
plt.show()