import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
# Load the wine-reviews dataset and take a first look at its structure.
wine_df = pd.read_csv("wine.csv")
wine_df.head()  # look at the head of our df (only renders in a notebook)
print('Shape\n', wine_df.shape)
print('--------------------------------------------------------------------------')
print('Number of rows\n', len(wine_df))
print('--------------------------------------------------------------------------')
print('Column headers\n', wine_df.columns)
print('--------------------------------------------------------------------------')
print('Data types\n', wine_df.dtypes)
print('--------------------------------------------------------------------------')
print('Index\n', wine_df.index)
print('--------------------------------------------------------------------------')
print('Missing Data\n', wine_df.isnull().sum())
print('--------------------------------------------------------------------------')
print('Duplicated Rows\n', wine_df.duplicated().sum())
print('--------------------------------------------------------------------------')
# Drop the extra index column carried over from the CSV export.
# FIX: the positional axis argument (`.drop('Unnamed: 0', 1)`) was deprecated
# in pandas 1.x and removed in pandas 2.0 — use the `columns=` keyword instead.
wine_df.drop(columns='Unnamed: 0', inplace=True)
# Delete rows that have identical values in every column.
wine_df.drop_duplicates(inplace=True)
# Five most expensive wines (result only renders in a notebook).
wine_df.loc[:, ['country', 'designation', 'price']].sort_values('price', ascending=False).head().reset_index()
# Wine Spectator tasters review wines on the following 100-point scale:
# https://www.winespectator.com/articles/scoring-scale
# So we're going to group the points column accordingly.
# Summary statistics for the raw point scores (renders in a notebook).
wine_df['points'].describe()
# Bucket scores per the Wine Spectator scale:
# (84, 89] -> very_good, (89, 94] -> outstanding, (94, 100] -> classic.
# NOTE(review): scores of 84 or below fall outside these bins and become NaN —
# confirm the dataset's minimum score against the describe() output above.
score_bins = (84, 89, 94, 100)
score_labels = ['very_good', 'outstanding', 'classic']
wine_df['scoring_scale'] = pd.cut(wine_df['points'], score_bins, labels=score_labels)
# Peek at how the pricey bottles (> $200) score.
is_expensive = wine_df['price'] > 200
wine_df.loc[is_expensive, ['designation', 'scoring_scale']].head(10)
print('not too bad')
# How many wines land in each bucket?
wine_df.groupby("scoring_scale")['designation'].count()
sns.countplot(x='scoring_scale', data=wine_df)
# Inspect a few of the top-tier ("classic") wines.
wine_df.loc[wine_df['scoring_scale'] == 'classic', ['country', 'designation', 'price', 'variety', 'points']].head()
# NOTE(review): this paragraph describes a fast-food restaurant dataset, not the
# wine data analysed elsewhere in this file — it looks like a leftover from a
# different notebook; confirm whether it should be removed.
a = '''this is a list of over 10,000 Fast Food restaurants provided by Datafiniti's Business Database. The dataset includes the restaurant's address, city, latitude and longitude coordinates, name, and more. Note that this is a sample of a large dataset The dataset contains 8 Columns and 10000 rows, the columns are mostly of Categorical variables except for longitude and latitude columns which are Continuous variables. See that the Dataset did not come with a Land Area Column I have decided to bring in another Dataset that has Land Area data and join it to our original Dataset this will help us to determine which state has the most Fast Food per meter or mile square. Some of the Questions discuss in the analysis include but not limited: - States with the most and least McDonald's per capita - Fast Food restaurants per capita for all states - Fast Food restaurants with the most locations nationally - States with the most and least Fast Food restaurants per capita - Cities with the most Fast Food restaurants per capita - States with the most and least Fast Food restaurants per capita - The number of Fast Food restaurants per capita We have taken full advantage of the Longitude and Latitude data to generate some informative plot geographically. '''
# Trim surrounding whitespace and show the cleaned text.
print(b := a.strip())
# Price distribution — we know the price column has missing values, so drop them first.
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True, stat='density') is the documented equivalent
# (density-scaled histogram with a KDE overlay).
sns.histplot(wine_df.price.dropna(), kde=True, stat='density')
# Top 5 producing countries by number of reviews.
top_pro = wine_df.country.value_counts()
top_5_pro = top_pro.iloc[0:5]
top_5_pro.plot(kind='barh', color=['r', 'b', 'g', 'r', 'y'])
## MOST TOP 5 WINE VARIETIES
top_5_variety = wine_df.variety.value_counts().head()
# NOTE: the 'w' (white) bar would be invisible on a white background;
# the black edgecolor keeps its outline visible.
top_5_variety.plot(kind='barh', color=['r', 'w', 'r', 'r', 'r'], edgecolor='black')
# Re-check missing data after the cleanup steps above (renders in a notebook).
wine_df.isnull().sum()
# ALTHOUGH THERE'S A GOOD AMOUNT OF MISSING DATA, IT DID NOT GREATLY AFFECT OUR EDA.
# If we had to fix the columns with missing values, I think K-nearest-neighbor imputation
# would be appropriate in this case, because columns like region_1 and region_2 have a very
# close relationship with the country, province, description, and some other columns.
# However, for the missing values in the price column I was thinking of the interpolate()
# method of filling, or using the average price of their scoring_scale group.
# LET'S GENERATE SOME WORD CLOUDS FOR PROVINCES AND VARIETIES
# Join every province into one big space-separated string for the word cloud.
# NOTE(review): assumes the province column holds only strings — ' '.join would
# raise TypeError on NaN floats; confirm against the isnull() output above.
province = ' '.join(list(wine_df.province))
# Size the canvas to suit the visualisation; the module picks its own
# internal width/height for the cloud image.
plt.figure(figsize=(12, 18))
wordcloud = WordCloud().generate(province)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.margins(x=0, y=0)
# Same treatment for grape varieties, on a white background this time.
# NOTE(review): assumes the variety column holds only strings — ' '.join would
# raise TypeError on NaN floats; verify there are no missing varieties.
variety = ' '.join(list(wine_df.variety))
plt.figure(figsize=(12, 18))
wordcloud = WordCloud(background_color='white').generate(variety)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.margins(x=0, y=0)