In [119]:
from IPython.core.display import HTML

print("Setting custom CSS for the IPython Notebook")
styles = open('custom.css', 'r').read()
#HTML(styles)


Setting custom CSS for the IPython Notebook

In [1]:
## all imports
import numpy as np

import pandas as pd
from pandas import Series
from pandas import DataFrame
pd.options.display.mpl_style = 'default'

import matplotlib.pyplot as plt
%matplotlib inline  

import seaborn as sns
sns.set_context("talk")

CS109

Verena Kaynig-Fittkau

vkaynig@seas.harvard.edu

IPython Notebooks:

IPython Notebooks:


In [2]:
print "Hello CS109"

print "I love IPython"


Hello CS109
I love IPython

Todays topics:

  • get started with data
  • read in a data file
  • visualize it

How to load a table

  • we use Pandas for this
  • Pandas can do a lot more
  • more about it later


In [123]:
url = 'https://raw.githubusercontent.com/cs109/2014/master/lectures/wur2013.txt'
ranking = pd.read_table(url)
ranking = ranking[:3000]
N = 50
colors = np.random.rand(N)
area = colors = np.random.rand(N)
plt.scatter(ranking.iloc[:,3], ranking.iloc[:,7])
plt.show()


Visualization

  • Data is now in a Pandas DataFrame
  • Lets try a bar plot

Matplotlib


In [129]:
matplotlib.pyplot.xkcd()
data_to_plot = ranking.overall
plt.bar(data_to_plot.index, data_to_plot)
plt.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-129-75d5a357e7dd> in <module>()
----> 1 matplotlib.pyplot.xkcd()
      2 data_to_plot = ranking.overall
      3 plt.bar(data_to_plot.index, data_to_plot)
      4 plt.show()

NameError: name 'matplotlib' is not defined

Plotting with Pandas


In [128]:
data_to_plot.plot(kind='bar')
plt.show()


How to Rank Universities

  • Academic peer review (40%)
  • Faculty student ratio (20%)
  • Citations per faculty (20%)
  • Recruiter review (10%)
  • International orientation (10%)

http://en.wikipedia.org/wiki/QS_World_University_Rankings

Selecting a Subframe


In [130]:
relevant_columns = ['academic','faculty','citations',
                    'employer','international']
ranking_categories=ranking[relevant_columns]
ranking_categories.head()


Out[130]:
academic faculty citations employer international
0 100.0 100.0 99.7 100.0 97.6
1 100.0 99.3 100.0 100.0 94.1
2 100.0 99.6 95.8 100.0 95.5
3 99.9 98.9 95.6 98.7 96.5
4 99.9 99.8 92.5 100.0 99.9

Weighting categories


In [131]:
weights = [0.4,0.2,0.2,0.1,0.1]
ranking_categories_weighted = ranking_categories * weights
ranking_categories_weighted.head()


Out[131]:
academic faculty citations employer international
0 40.00 20.00 19.94 10.00 9.76
1 40.00 19.86 20.00 10.00 9.41
2 40.00 19.92 19.16 10.00 9.55
3 39.96 19.78 19.12 9.87 9.65
4 39.96 19.96 18.50 10.00 9.99

Multiple Bar Plot


In [136]:
ranking_categories_weighted.head().plot(kind='bar')
plt.show()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-136-b055741f0567> in <module>()
----> 1 plt.xkcd()
      2 ranking_categories_weighted.head().plot(kind='bar')
      3 plt.show()

AttributeError: 'module' object has no attribute 'xkcd'

Quiz: Fix the Legend


In [11]:
ax = ranking_categories_weighted.head().plot(kind='bar', legend=False)

# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(0, 0))

plt.show()


Stacked Bar Plot


In [12]:
ranking_categories_weighted.head().plot(kind='bar', legend=False, stacked=True)
plt.show()


Horizontal Barplot


In [118]:
matplotlib.pyplot.xkcd()
ranking_categories_weighted.plot(kind='barh', 
                                 legend=False, stacked=True)
plt.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-118-74a03574413b> in <module>()
----> 1 matplotlib.pyplot.xkcd()
      2 ranking_categories_weighted.plot(kind='barh', 
      3                                  legend=False, stacked=True)
      4 plt.show()

NameError: name 'matplotlib' is not defined

Adding Labels


In [29]:



  File "<ipython-input-29-95d42bbc206d>", line 2
    ===
     ^
SyntaxError: invalid syntax

In [14]:
ranking.head().schoolname
ranking_categories.set_index(ranking.schoolname, inplace=True)

ranking_categories.plot(kind='barh', legend=False, stacked=True)
plt.show()


You Should Know Now

  • How to read in a data file
  • How to plot from a DataFrame
  • How to get help
  • Matplotlib is great
  • and can be painful

In [5]:
N = 50
colors = np.random.rand(N)
area = colors = np.random.rand(N)
ranking.iloc[:,3]
plt.scatter(ranking_categories_weighted.iloc[:,3], ranking_categories_weighted.iloc[:,3], s=1, c=colors, alpha=0.5)
plt.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-69e9902450f4> in <module>()
      2 colors = np.random.rand(N)
      3 area = colors = np.random.rand(N)
----> 4 ranking.iloc[:,3]
      5 plt.scatter(ranking_categories_weighted.iloc[:,3], ranking_categories_weighted.iloc[:,3], s=1, c=colors, alpha=0.5)
      6 plt.show()

NameError: name 'ranking' is not defined

In [14]:


In [14]:


In [14]:


In [15]:
titanic = sns.load_dataset("titanic")
titanic
titanic.age.hist(bins=20);

sns.lmplot("fare", "age", titanic, fit_reg=False);
sns.lmplot("fare", "age", hue="class", data=titanic, fit_reg=False)
sns.lmplot("fare", "age", hue="class", size="class", data=titanic, fit_reg=False)

sns.PairGrid(iris, vars=["sepal_length", "sepal_width"], hue="species")

fig = plt.figure()
ax = fig.add_subplot(1,1,1) # one row, one column, first plot
ax.set_title("At Bats vs. Hits.  Size = Home Runs")
ax.set_xlabel("At Bats")
ax.set_ylabel("Hits")
sns.jointplot(titanic.fare,titanic.age);
plt.scatter(titanic.fare, titanic.age, s=titanic.pclass, alpha=0.5)
#xlim(0, 700); ylim(0, 200)

from mpl_toolkits.mplot3d import Axes3D

fig2 = plt.figure()
ax2 = fig2.add_subplot(111, projection='3d')
ax2.scatter(titanic.age, titanic.fare, titanic.pclass)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-15-e76342419075> in <module>()
      5 sns.lmplot("fare", "age", titanic, fit_reg=False);
      6 sns.lmplot("fare", "age", hue="class", data=titanic, fit_reg=False)
----> 7 sns.lmplot("fare", "age", hue="class", size="class", data=titanic, fit_reg=False)
      8 
      9 sns.PairGrid(iris, vars=["sepal_length", "sepal_width"], hue="species")

/Library/Python/2.7/site-packages/seaborn/linearmodels.pyc in lmplot(x, y, data, hue, col, row, palette, col_wrap, size, aspect, sharex, sharey, hue_order, col_order, row_order, dropna, legend, legend_out, **kwargs)
    758                        size=size, aspect=aspect, col_wrap=col_wrap,
    759                        sharex=sharex, sharey=sharey,
--> 760                        legend=legend, legend_out=legend_out)
    761 
    762     # Hack to set the x limits properly, which needs to happen here

/Library/Python/2.7/site-packages/seaborn/axisgrid.pyc in __init__(self, data, row, col, hue, col_wrap, sharex, sharey, size, aspect, palette, row_order, col_order, hue_order, dropna, legend, legend_out, despine, margin_titles, xlim, ylim)
    228                                      squeeze=False,
    229                                      sharex=sharex, sharey=sharey,
--> 230                                      subplot_kw=subplot_kw)
    231             self.axes = axes
    232 

/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/pyplot.pyc in subplots(nrows, ncols, sharex, sharey, squeeze, subplot_kw, **fig_kw)
    864         subplot_kw = {}
    865 
--> 866     fig = figure(**fig_kw)
    867 
    868     # Create empty object array to hold all axes.  It's easiest to make it 1-d

/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/pyplot.pyc in figure(num, figsize, dpi, facecolor, edgecolor, frameon, FigureClass, **kwargs)
    341                                              frameon=frameon,
    342                                              FigureClass=FigureClass,
--> 343                                              **kwargs)
    344 
    345         if figLabel:

/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/backends/backend_agg.pyc in new_figure_manager(num, *args, **kwargs)
    384 
    385     FigureClass = kwargs.pop('FigureClass', Figure)
--> 386     thisFig = FigureClass(*args, **kwargs)
    387     canvas = FigureCanvasAgg(thisFig)
    388     manager = FigureManagerBase(canvas, num)

/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/figure.pyc in __init__(self, figsize, dpi, facecolor, edgecolor, linewidth, frameon, subplotpars)
    266         self.dpi_scale_trans = Affine2D()
    267         self.dpi = dpi
--> 268         self.bbox_inches = Bbox.from_bounds(0, 0, *figsize)
    269         self.bbox = TransformedBbox(self.bbox_inches, self.dpi_scale_trans)
    270 

/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/transforms.pyc in from_bounds(x0, y0, width, height)
    743         *width* and *height* may be negative.
    744         """
--> 745         return Bbox.from_extents(x0, y0, x0 + width, y0 + height)
    746 
    747     @staticmethod

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [12]:


In [38]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.image import BboxImage

from matplotlib._png import read_png
import matplotlib.colors
from matplotlib.cbook import get_sample_data

if 1:
    from matplotlib.transforms import Bbox, TransformedBbox
    from matplotlib.ticker import ScalarFormatter

    fig = plt.gcf()
    fig.clf()
    ax = plt.subplot(111)

    years = np.arange(2004, 2009)
    
    # --- changed this line --- #
    box_colors = sns.color_palette()
    #brewer2mpl.get_map('Set1', 'qualitative', 5).mpl_colors
    
    #heights = np.random.random(years.shape) * 10000 + 3000
    heights = {100, 10, 100, 10, 10}

    fmt = ScalarFormatter(useOffset=False)
    ax.xaxis.set_major_formatter(fmt)

    for year, h, bc in zip(years, heights, box_colors):
        # --- this is the line we changed --- #
        ax.bar(year-0.4, h, color=bc, linewidth=0)

        ax.annotate(r"%d" % (int(h/100.)*100),
                    (year, h+100), va="bottom", ha="center")


    ax.set_xlim(years[0]-0.5, years[-1]+0.5)
    ax.set_yscale('log')
    ax.set_ylim(1e1, 1e4)
    
    # --- Added this line --- #
    #ax.spines['top'].set_visible(False)
    #ax.spines['right'].set_visible(False)
    #ax.spines['left'].set_visible(False)
    
    # --- Added this line --- #
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    
    #ax.grid(axis = 'y', color ='white', linestyle='-')
    
    #fig.savefig('ribbon_box_no_ribbons.png')
    plt.show()



In [29]:
import numpy as np
import pandas as pd
import seaborn as sns

x =  10**np.arange(1, 10)
y = 10**np.arange(1,10)*2
df1 = pd.DataFrame(data=y, index=x)
df2 = pd.DataFrame(data = {'x': x, 'y': y})

fgrid = sns.lmplot('x', 'y', df2)    
ax = fgrid.axes[0][0]
df1.plot(ax=ax)        

ax.set_xscale('log')
ax.set_yscale('log')



In [ ]: