In [ ]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Real data is messy

Titanic


In [ ]:
import csv
import sys

# The csv module expects a text-mode file opened with newline='' on Python 3,
# while on Python 2 it expects a binary-mode file. Pick the mode that matches
# the running kernel so the reader can be iterated on either version.
if sys.version_info[0] >= 3:
    csv_file = open('titanic_train.csv', newline='')
else:
    csv_file = open('titanic_train.csv', 'rb')

# Keep a named handle on the file so it can be released with csv_file.close()
# once the reader has been fully consumed by the following cells.
csv_file_object = csv.reader(csv_file)

In [ ]:
# Pull the first row off the reader and treat it as the header row.
# The built-in next() works on Python 2.6+ and Python 3, whereas the
# reader's .next() method only exists on Python 2.
header = next(csv_file_object)
print(header)

In [ ]:
# Drain every remaining row from the reader into an in-memory list.
lines = list(csv_file_object)

In [ ]:
# Show the first row collected in `lines` as a raw list of fields.
print(lines[0])

In [ ]:
# Show the second row collected in `lines` for comparison with the first.
print(lines[1])

Use pandas!


In [ ]:
import pandas as pd
# Read the same CSV file with pandas, which returns a DataFrame.
titanic_train = pd.read_csv("titanic_train.csv")

In [ ]:
# Bare expression on the cell's last line: the notebook renders the DataFrame.
titanic_train

Community health status from data.gov


In [ ]:
# Read LEADINGCAUSESOFDEATH.csv into a DataFrame and display it.
# NOTE(review): no na_values are passed to read_csv, so any sentinel codes in
# the file (e.g. -9999, -2222, -1111) would be kept as ordinary numbers —
# confirm whether they should be mapped to NaN before any analysis.
community_leading_cause = pd.read_csv("LEADINGCAUSESOFDEATH.csv")
community_leading_cause
Special values used in this data set to mark missing data:

- `-9999`: indicates an N.A. value from the source data for the Unemployed column on the VUNERABLEPOPSANDENVHEALTH page.
- `-2222`, `-2222.2` or `-2`: nda (no data available); see the Data Notes document for details.
- `-1111.1`, `-1111` or `-1`: nrf (no report); see the Data Notes document for details.

Twitter user locations


In [ ]:
# Read users.csv into a DataFrame.
users = pd.read_csv("users.csv")

In [ ]:
# Bare expression on the cell's last line: the notebook renders the DataFrame.
users

In [ ]:
# Compare the number of rows with the number of distinct location values.
user_count = len(users)
print("Number of users: %d" % user_count)

# nunique() ignores missing values by default, just as value_counts() does.
distinct_location_count = users.location.nunique()
print("Unique locations: %d" % distinct_location_count)

In [ ]:
# Tally how often each distinct location value occurs; value_counts() sorts
# the result in descending order of frequency by default.
users.location.value_counts()

In [ ]: