In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# Both sample CSVs live in the same course folder. Keeping that folder in one
# constant means the notebook can be pointed at another checkout by editing a
# single line instead of two long literals.
DATA_DIR = '/Users/atma6951/Documents/code/pychakras/pychakras/udemy_ml_bootcamp/Python-for-Data-Visualization/Pandas Built-in Data Viz'

# df1: the first CSV column becomes the row index.
df1 = pd.read_csv(DATA_DIR + '/df1', index_col=0)
# df2: read with the default integer index.
df2 = pd.read_csv(DATA_DIR + '/df2')
In [3]:
# Preview the first five rows of df1.
df1.head(n=5)
Out[3]:
In [4]:
# Preview the first five rows of df2.
df2.head(n=5)
Out[4]:
- `df.plot()` - specify the plot type, the X and Y columns etc.
- `df.plot.hist()` - calling plot in OO fashion. Only specify the X and Y and color or size columns.
- `df['column'].plot.plotname()` - calling plot on a series.

Types of plot that can be called: area, bar, line, scatter, box, hexbin, kde etc.
In [8]:
# Histogram of column 'A' through the generic `plot` entry point.
# The column has to be passed as `y`: for a histogram, `x='A'` turns 'A' into
# the index and draws every *other* column instead of 'A'.
df1.plot(y='A', kind='hist')
Out[8]:
In [10]:
# Histogram called directly on the Series for column 'A', using 30 bins.
col_a = df1['A']
col_a.plot.hist(bins=30)
Out[10]:
In [7]:
# Call `hist` on the whole DataFrame rather than on a single column.
df1.hist()
Out[7]:
In reality, you have a lot more columns. You can prettify the above by creating a layout and figsize:
In [8]:
# Arrange the per-column histograms on a 2x2 grid in a 7x7 figure,
# then let matplotlib tidy the spacing between panels.
hist_opts = dict(bins=25, layout=(2, 2), figsize=(7, 7))
ax_list = df1.hist(**hist_opts)
plt.tight_layout()
In [14]:
# One row of four panels that share both axes.
shared_axes = dict(sharex=True, sharey=True)
ax_list = df1.hist(bins=25, layout=(1, 4), figsize=(15, 4), **shared_axes)
In [15]:
# Same shared-axes histograms, laid out as a 2x2 grid instead of a single row.
shared_axes = dict(sharex=True, sharey=True)
ax_list = df1.hist(bins=25, layout=(2, 2), figsize=(8, 8), **shared_axes)
In [13]:
# Switch matplotlib's global style to the dark theme (this stays active for
# later cells until another style is applied), then draw df2 as a stacked
# area chart.
plt.style.use('dark_background')
df2.plot.area(stacked=True)
Out[13]:
In [14]:
# Apply 'fivethirtyeight' on top of matplotlib's defaults rather than on top of
# whatever style an earlier cell left active. A style sheet only overrides the
# settings it defines, so leftovers from a previous style (e.g. the white text
# and tick colours of 'dark_background') would otherwise bleed into this chart.
plt.style.use(['default', 'fivethirtyeight'])
df2.plot.bar()
Out[14]:
In [18]:
# reset the style
plt.style.use('default')
# Line plot of column 'A' against the DataFrame index. The index is already the
# x-axis by default, so it is not passed as `x`: `x` expects a column label, and
# recent pandas versions try to look an Index object up as column labels.
# `figsize` is passed through to matplotlib and `lw` is the line width.
df1.plot.line(y='A', figsize=(12,2), lw=1)
Out[18]:
In [21]:
# Scatter of A against B; marker colour is taken from column C and mapped
# through the 'coolwarm' colormap.
scatter_opts = dict(x='A', y='B', c='C', cmap='coolwarm')
df1.plot.scatter(**scatter_opts)
Out[21]:
In [22]:
# Marker size is driven by column 'c'. Passing just the column name (s='c')
# produces tiny points, so the values are scaled by 100 -- which means handing
# over the actual Series rather than the column name.
marker_sizes = df2['c'] * 100
df2.plot.scatter(x='a', y='b', s=marker_sizes)
Out[22]:
In [23]:
# Kernel density estimate for column 'A' only.
col_a = df1['A']
col_a.plot.kde()
Out[23]:
Visualize the density of all columns in one plot
In [24]:
# Call `kde` on the whole DataFrame instead of on one column.
df1.plot.kde()
Out[24]:
In [26]:
df2.plot.density() # `density` is an alias of `kde` (per the pandas plotting docs)
Out[26]:
Word clouds are a great way to visualize the frequency of terms that appear in a data set. This is accomplished using the `wordcloud` library. You can install it with:
conda install -c conda-forge wordcloud
In [2]:
# Read the registrant CSV from the working directory and preview the first five rows.
registrant_df = pd.read_csv('./registrant.csv')
registrant_df.head(n=5)
Out[2]:
Now, let us plot the responses from the column `What would you like to learn?` as a word cloud. First, we need to turn the series into a paragraph.
In [5]:
# Flatten the answers into one space-separated string:
# drop the missing responses, then join what is left.
question = 'What would you like to learn?'
obj_series = registrant_df[question].dropna()
obj_list = obj_series.tolist()
obj_string = ' '.join(obj_list)
obj_string
Out[5]:
In [12]:
from wordcloud import WordCloud

# Word-cloud generator configured for a 1000x600 canvas with a white background.
wc = WordCloud(
    width=1000,
    height=600,
    background_color='white',
)
In [13]:
# Build the word cloud from the combined response text.
obj_wc_img = wc.generate_from_text(text=obj_string)
In [14]:
# Render the word cloud on an explicit figure/axes pair: large canvas,
# bilinear interpolation for smoother glyph edges, no axis decorations.
fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(obj_wc_img, interpolation="bilinear")
ax.axis('off')
ax.set_title('What would you like to learn?');