In [ ]:
%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Let's create a DataFrame with one column made up of 1s and 0s and another, categorical column.
In [ ]:
# Categorical class labels: 300 rows of Class1, then 400 of Class2, then 300 of Class3
x = [
    label
    for label, count in (('Class1', 300), ('Class2', 400), ('Class3', 300))
    for _ in range(count)
]
# Binary column: 1000 draws from {0, 1}, each value equally likely
y = np.random.choice([0, 1], size=1000)
# Combine both columns into a single frame (column order: Class, N)
df = pd.DataFrame(dict(Class=x, N=y))
Now, let's create histograms of the N column, grouped by the Class column, using the `by` parameter of `hist()`:
In [36]:
# Histogram of the binary N column, drawn separately for each Class label
plts = df['N'].hist(
    by=df['Class'],
)
OK, let's weight the generation of the binary column, using the `p` parameter of `random.choice()`:
In [44]:
# Same class layout as before: 300 x Class1, 400 x Class2, 300 x Class3
x = [
    label
    for label, count in (('Class1', 300), ('Class2', 400), ('Class3', 300))
    for _ in range(count)
]
# Weighted binary column: 0 is drawn with probability 0.25, 1 with probability 0.75
y = np.random.choice([0, 1], size=1000, p=[0.25, 0.75])
df = pd.DataFrame(dict(Class=x, N=y))
# Per-class histograms of the weighted N column; `by` does the grouping,
# so no explicit groupby is needed
plts = df['N'].hist(
    by=df['Class'],
)
OK, but what about using a continuous variable? We can use pandas.cut() to bin the continuous variable:
In [45]:
# Random continuous x data: 1000 uniform samples from the half-open range [0, 9)
x = np.random.rand(1000) * 9
# Bin x into three equal-width classes with edges at 0, 3, 6 and 9.
# right=False makes the intervals left-closed -- [0, 3), [3, 6), [6, 9) -- which
# matches the [0, 9) range of x. With the default right-closed bins (0, 3], ...
# a sample of exactly 0.0 would fall outside every bin and be labelled NaN.
# https://pandas.pydata.org/docs/reference/api/pandas.cut.html
bins = pd.cut(x, [0, 3, 6, 9], right=False)
bins
Out[45]:
Let's create another DataFrame using a binary column and the binning from above:
In [42]:
# Binary column: 1000 draws from {0, 1}, each value equally likely
y = np.random.choice([0, 1], size=1000)
# Pair the binary column with the categorical bins computed from the continuous x data
df = pd.DataFrame(dict(y=y, Class=bins))
# One histogram of y per bin
plts = df['y'].hist(
    by=df['Class'],
)
In [ ]:
In [43]:
# Weighted binary column: 0 is drawn with probability 0.25, 1 with probability 0.75
y = np.random.choice([0, 1], size=1000, p=[0.25, 0.75])
# Pair the weighted column with the categorical bins computed from the continuous x data
df = pd.DataFrame(dict(y=y, Class=bins))
# One histogram of y per bin
plts = df['y'].hist(
    by=df['Class'],
)