In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt# Load data
from sklearn import linear_model
from sklearn.cluster import KMeans as km
import statsmodels.api as sm
%matplotlib inline
# Read the class-project city table into a DataFrame; later cells work from `city_data`.
city_data = pd.read_csv(filepath_or_buffer='USData_ClassProject1.csv')
In [16]:
# Regress median household income on percent white across all cities.
y = city_data.MedianHouseholdIncome
# add_constant adds an intercept column; the predictor stays reachable as
# x.PercentWhite, which the plotting code below relies on.
x = sm.add_constant(city_data.PercentWhite)
clf = linear_model.LinearRegression().fit(x, y)
y_hat = clf.predict(x)
# Single-argument print(...) runs on both Python 2 and Python 3. The two spaces
# after the colon reproduce what the old `print "R^2: ", value` statement emitted.
print("R^2:  " + str(clf.score(x, y)))
def cluster(X, model=None):
    """Fit an estimator on ``X`` and return its predictions for the same rows.

    Parameters
    ----------
    X : array-like
        Feature rows passed unchanged to ``fit`` and then to ``predict``.
    model : object, optional
        Any object exposing ``fit(X)`` and ``predict(X)``. When omitted, the
        notebook-global ``clf`` is used, which is the original behaviour.

    Returns
    -------
    Whatever ``predict(X)`` returns for the fitted estimator.
    """
    # The global `clf` is rebound several times in this notebook (regression
    # model, then clustering model); passing `model` explicitly avoids
    # depending on which cell ran last.
    estimator = clf if model is None else model
    estimator.fit(X)
    return estimator.predict(X)
# Cluster all cities into three groups on (percent white, median income).
# random_state pins the k-means initialisation so the cluster colours are
# reproducible from one run to the next.
clf = km(3, random_state=0)
city_features = city_data[['PercentWhite', 'MedianHouseholdIncome']]
# z-score each column into a NEW frame (ddof=0 matches np.std) instead of
# assigning into a slice of city_data, which is the SettingWithCopy pattern.
X = (city_features - city_features.mean()) / city_features.std(ddof=0)
# Points are drawn in original units; only the clustering sees scaled features.
plt.scatter(city_data.PercentWhite, city_data.MedianHouseholdIncome, c=cluster(X), s=50)
# Overlay the regression line fitted earlier in this cell (x, y_hat). The
# original passed both the 'b' format string and color="white"; the keyword
# takes precedence, so only the keyword is kept.
plt.plot(x.PercentWhite, y_hat, color="white", alpha=0.9)
plt.ylabel('Median Household Income')
plt.xlabel('Percent White')
Out[16]:
In [ ]:
In [ ]:
# NOTE(review): this cell is token-for-token the same as the standardise-and-plot
# cell that appears later in the notebook, after `rich_people` is created.
# Here `rich_people` has not been assigned yet in top-to-bottom order, so a
# fresh "Restart & Run All" would presumably stop with a NameError at the
# scatter call. It looks like a leftover copy; TODO confirm and remove or move it.
# Rescale both columns of X to zero mean / unit standard deviation (in place).
X.PercentWhite = (X.PercentWhite - np.mean(X.PercentWhite))/np.std(X.PercentWhite)
X.MedianHouseholdIncome = (X.MedianHouseholdIncome - np.mean(X.MedianHouseholdIncome))/np.std(X.MedianHouseholdIncome)
# Plot rich_people in original units, coloured by the labels cluster(X) returns.
plt.scatter(rich_people.PercentWhite, rich_people.MedianHouseholdIncome, c=cluster(X), s=50)
plt.ylabel('Median Household Income')
plt.xlabel('Percent White')
In [ ]:
# Summary statistics over all cities: PercentWhite first, then TotalPopulation.
# Single-argument print(...) is valid on both Python 2 and Python 3.
percent_white = city_data.PercentWhite
total_population = city_data.TotalPopulation
print("Mean: " + str(np.mean(percent_white)))
print("Median: " + str(np.median(percent_white)))
print("Median Population: " + str(np.median(total_population)))
# Label typo fixed ("Popluation" -> "Population").
print("Mean Population: " + str(np.mean(total_population)))
In [ ]:
# Cities whose median household income is at least 150000.
# A single .loc[rows, columns] lookup replaces the chained [[cols]][mask]
# indexing and selects the same rows and columns.
rich_people = city_data.loc[
    city_data.MedianHouseholdIncome >= 150000,
    ['City', 'State', 'MedianHouseholdIncome', 'PercentWhite', 'TotalPopulation']
]
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("Mean white: " + str(np.mean(rich_people.PercentWhite)))
print("Median white: " + str(np.median(rich_people.PercentWhite)))
# Last expression of the cell: rendered as a table by the notebook.
rich_people
In [ ]:
# Income-vs-percent-white regression restricted to the rich_people subset.
y = rich_people.MedianHouseholdIncome
x = sm.add_constant(rich_people.PercentWhite)
clf = linear_model.LinearRegression().fit(x, y)
y_hat = clf.predict(x)
plt.scatter(rich_people.PercentWhite, y, color="Black")
plt.plot(x.PercentWhite, y_hat, 'b', alpha=0.9)
# Axis labels added so the figure matches the other income/percent-white plots.
plt.xlabel('Percent White')
plt.ylabel('Median Household Income')
# Single-argument print(...) runs on both Python 2 and Python 3. The two spaces
# after the colon reproduce what the old `print "R^2: ", value` statement emitted.
print("R^2:  " + str(clf.score(x, y)))
In [ ]:
def cluster(X, model=None):
    """Fit an estimator on ``X`` and return its predictions for the same rows.

    This repeats the ``cluster`` helper defined in an earlier cell so the cell
    can be run on its own.

    Parameters
    ----------
    X : array-like
        Feature rows passed unchanged to ``fit`` and then to ``predict``.
    model : object, optional
        Any object exposing ``fit(X)`` and ``predict(X)``. When omitted, the
        notebook-global ``clf`` is used, which is the original behaviour.

    Returns
    -------
    Whatever ``predict(X)`` returns for the fitted estimator.
    """
    # The global `clf` is rebound several times in this notebook; an explicit
    # `model` avoids depending on which cell ran last.
    estimator = clf if model is None else model
    estimator.fit(X)
    return estimator.predict(X)
# Two-cluster k-means on the rich_people subset, using the raw (unscaled) columns.
# random_state pins the initialisation so the colouring is reproducible.
clf = km(2, random_state=0)
# .copy() gives X its own data, so later column assignments on X do not act on
# (or warn about) a slice of rich_people.
X = rich_people[['PercentWhite', 'MedianHouseholdIncome']].copy()
plt.scatter(rich_people.PercentWhite, rich_people.MedianHouseholdIncome, c=cluster(X), s=50)
plt.ylabel('Median Household Income')
plt.xlabel('Percent White')
In [ ]:
# Re-run the clustering on z-scored features so neither column dominates the
# distance computation purely because of its scale.
feature_cols = ['PercentWhite', 'MedianHouseholdIncome']
# Work on an independent copy rather than assigning into what may be a slice
# of another frame; ddof=0 matches the np.std scaling used originally.
X = X.copy()
X[feature_cols] = (X[feature_cols] - X[feature_cols].mean()) / X[feature_cols].std(ddof=0)
# Points stay in original units; only the cluster labels come from scaled data.
plt.scatter(rich_people.PercentWhite, rich_people.MedianHouseholdIncome, c=cluster(X), s=50)
plt.ylabel('Median Household Income')
plt.xlabel('Percent White')
In [ ]:
# Cities whose median household income is at least 100000.
# A single .loc[rows, columns] lookup replaces the chained [[cols]][mask]
# indexing and selects the same rows and columns.
well_off_people = city_data.loc[
    city_data.MedianHouseholdIncome >= 100000,
    ['City', 'MedianHouseholdIncome', 'PercentWhite', 'TotalPopulation']
]
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("Mean white: " + str(np.mean(well_off_people.PercentWhite)))
print("Median white: " + str(np.median(well_off_people.PercentWhite)))
# Last expression of the cell: rendered as a table by the notebook.
well_off_people
In [ ]:
# Income-vs-percent-white regression restricted to the well_off_people subset.
y = well_off_people.MedianHouseholdIncome
x = sm.add_constant(well_off_people.PercentWhite)
clf = linear_model.LinearRegression().fit(x, y)
y_hat = clf.predict(x)
plt.scatter(well_off_people.PercentWhite, y, color="Black")
plt.plot(x.PercentWhite, y_hat, 'b', alpha=0.9)
# Axis labels added so the figure matches the other income/percent-white plots.
plt.xlabel('Percent White')
plt.ylabel('Median Household Income')
# Single-argument print(...) runs on both Python 2 and Python 3. The two spaces
# after the colon reproduce what the old `print "R^2: ", value` statement emitted.
print("R^2:  " + str(clf.score(x, y)))
In [ ]:
# Cities where PercentWhite is at least 95.
# A single .loc[rows, columns] lookup replaces the chained [[cols]][mask]
# indexing and selects the same rows and columns.
very_white_people = city_data.loc[
    city_data.PercentWhite >= 95,
    ['City', 'MedianHouseholdIncome', 'PercentWhite', 'TotalPopulation']
]
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("Mean income: " + str(np.mean(very_white_people.MedianHouseholdIncome)))
print("Median income: " + str(np.median(very_white_people.MedianHouseholdIncome)))
# Last expression of the cell: rendered as a table by the notebook.
very_white_people
In [ ]:
# Income-vs-percent-white regression restricted to the very_white_people subset.
y = very_white_people.MedianHouseholdIncome
x = sm.add_constant(very_white_people.PercentWhite)
clf = linear_model.LinearRegression().fit(x, y)
y_hat = clf.predict(x)
plt.scatter(very_white_people.PercentWhite, y, color="Black")
plt.plot(x.PercentWhite, y_hat, 'b', alpha=0.9)
# Axis labels added so the figure matches the other income/percent-white plots.
plt.xlabel('Percent White')
plt.ylabel('Median Household Income')
# Single-argument print(...) runs on both Python 2 and Python 3. The two spaces
# after the colon reproduce what the old `print "R^2: ", value` statement emitted.
print("R^2:  " + str(clf.score(x, y)))
In [ ]:
# Regress EdHighSchoolPercent on PercentHispanic across all cities.
# Note the two names: X is the raw predictor column, x is X plus the intercept
# column added by add_constant.
X = city_data.PercentHispanic
y = city_data.EdHighSchoolPercent
x = sm.add_constant(X)
clf = linear_model.LinearRegression().fit(x, y)
y_hat = clf.predict(x)
plt.scatter(X, y, color="Black")
plt.plot(x.PercentHispanic, y_hat, 'b', alpha=0.9)
plt.xlabel("PercentHispanic percent")
plt.ylabel("Highschool percent")
# Single-argument print(...) runs on both Python 2 and Python 3. The two spaces
# after the colon reproduce what the old `print "R^2: ", value` statement emitted.
print("R^2:  " + str(clf.score(x, y)))
In [ ]:
# Sum of the per-city means of the high-school and elementary percentages.
mean_of_city_percentage_of_people_with_no_collage = (
    np.mean(city_data.EdHighSchoolPercent) + np.mean(city_data.EdElementaryPercent)
)
# The original left this as a bare expression in the middle of the cell, where
# the notebook never displays it; print it so the value is actually shown.
print("Mean high-school + elementary percent: "
      + str(mean_of_city_percentage_of_people_with_no_collage))

# Regress EdHighSchoolPercent on TotalPopulation across all cities.
# X is the raw predictor column, x is X plus the intercept column.
X = city_data.TotalPopulation
y = city_data.EdHighSchoolPercent
x = sm.add_constant(X)
clf = linear_model.LinearRegression().fit(x, y)
y_hat = clf.predict(x)
plt.scatter(X, y, color="Black")
plt.plot(x.TotalPopulation, y_hat, 'b', alpha=0.9)
# The x axis is TotalPopulation, not a percentage; the label said "Total percent".
plt.xlabel("Total population")
plt.ylabel("Highschool percent")
# Single-argument print(...) runs on both Python 2 and Python 3. The two spaces
# after the colon reproduce what the old `print "R^2: ", value` statement emitted.
print("R^2:  " + str(clf.score(x, y)))