The function below prints the column names and some basic information about each column. It also reports the min, max, and mean for each column. Finally, it draws a scatter-plot matrix of all columns against each other.
As you can see, there are no missing values, and has_profile and has_pic are boolean columns.
In [26]:
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
def expAnalysis(data, colNumber):
    """Print an exploratory summary of a DataFrame and draw its scatter matrix.

    Prints the frame's dimensions and column names, then for every column its
    dtype, the number of distinct values and the number of missing values.
    Columns whose dtype is int64 or float64 additionally get their min, max
    and mean. Finally a scatter-plot matrix of the frame is drawn.

    Parameters
    ----------
    data : pandas.DataFrame
        The table to summarise.
    colNumber : object
        Not used by the function; kept so existing calls keep working.

    Returns
    -------
    None
    """
    n_rows, n_cols = data.shape
    # Single-argument print(...) calls run unchanged on Python 2 and 3.
    print('The dimensions are height: ' + str(n_rows) + ' width: ' + str(n_cols))

    print('The column names are: ')
    for name in data.columns:
        print(' ' + str(name))

    print('Information about the columns')
    # Compare against explicit numpy dtypes; the bare names ``dtype``/``int64``
    # only exist when the notebook is started in pylab mode.
    numeric_dtypes = (np.dtype('int64'), np.dtype('float64'))
    for position, name in enumerate(data.columns):
        # Select by position so duplicated column names still yield a Series.
        column = data.iloc[:, position]
        print(' ' + str(name) + ' is ' + str(column.dtype))
        # Series.unique() counts NaN once; set() would count every NaN
        # separately because NaN never compares equal to itself.
        print(' unique values: ' + str(len(column.unique())))
        print(' missing values: ' + str(int(column.isnull().sum())))
        if column.dtype in numeric_dtypes:
            # The Series reductions skip NaN, so min/max agree with the mean.
            print(' min: ' + str(column.min()))
            print(' max: ' + str(column.max()))
            print(' mean: ' + str(column.mean()))

    # scatter_matrix lives in pandas.plotting on current pandas; fall back to
    # the old top-level location on releases that predate that module.
    getattr(pd, 'plotting', pd).scatter_matrix(data)
# Run the exploratory summary on 'twitter_user_data_data.csv'.
expAnalysis(pd.read_csv('twitter_user_data_data.csv'), 10)
In [27]:
# Run the exploratory summary on 'twitter_user_data.csv'.
expAnalysis(pd.read_csv('twitter_user_data.csv'), 10)
In [28]:
# Run the exploratory summary on 'twitter_user_datascience_data.csv'.
expAnalysis(pd.read_csv('twitter_user_datascience_data.csv'), 10)
In [29]:
# Shuffle the rows of 'twitter_user_data_data.csv' and write a 60% / 40%
# training / test split to two new CSV files.
twitter_data = pd.read_csv('twitter_user_data_data.csv')
# set the random number generator seed so the split is reproducible
# (numpy's generator, imported above as npr; a bare ``random`` is never imported)
npr.seed(32835)
nrows = twitter_data.shape[0]
# shuffle() permutes in place, so it needs a real list rather than a lazy range
rows = list(range(nrows))
npr.shuffle(rows)
split_point = int(nrows * .60)
train_rows = rows[:split_point]
test_rows = rows[split_point:]
train_index = twitter_data.index[train_rows]
test_index = twitter_data.index[test_rows]
# .loc selects by index label, which is what train_index / test_index hold
training_data = twitter_data.loc[train_index, :]
test_data = twitter_data.loc[test_index, :]
training_data.to_csv('twitter_user_data_data_training.csv', index=False)
test_data.to_csv('twitter_user_data_data_test.csv', index=False)
In [29]: