notebook.community

Edit and run



In [300]:

    
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
import seaborn as sns
import scipy.stats as stats
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.decomposition import PCA as sklearn_pca
import locale
from locale import atof
import warnings
from IPython.display import display
from sklearn import linear_model
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.feature_selection import f_regression
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import xlrd



In [301]:

    
# Import FBI Raw Data

fbidata = pd.read_csv('https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/New_York_offenses/NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013%20-%2013tbl8ny.csv', delimiter=",", thousands=',',decimal=".")
fbiraw = pd.DataFrame(fbidata)
fbiraw.head()









    Out[301]:







  
    
      
      Table 8
      Unnamed: 1
      Unnamed: 2
      Unnamed: 3
      Unnamed: 4
      Unnamed: 5
      Unnamed: 6
      Unnamed: 7
      Unnamed: 8
      Unnamed: 9
      Unnamed: 10
      Unnamed: 11
      Unnamed: 12
    
  
  
    
      0
      NEW YORK
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1
      Offenses Known to Law Enforcement
      NaN
      NaN
      NaN
      NaN
      
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      2
      by City, 2013
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      3
      City
      Population
      Violent\ncrime
      Murder and\nnonnegligent\nmanslaughter
      Rape\n(revised\ndefinition)1
      Rape\n(legacy\ndefinition)2
      Robbery
      Aggravated\nassault
      Property\ncrime
      Burglary
      Larceny-\ntheft
      Motor\nvehicle\ntheft
      Arson3
    
    
      4
      Adams Village
      1,861
      0
      0
      NaN
      0
      0
      0
      12
      2
      10
      0
      0



In [302]:

    
#Transform FBI Raw Data
#Rename columns with row 3 from the original data set
fbiraw_t1 = fbiraw.rename(columns=fbiraw.iloc[3])

#Delete first three rows don´t contain data for the regression model
fbiraw_t2 = fbiraw_t1.drop(fbiraw_t1.index[0:4])



In [303]:

    
#Delete column "Rape (revised definition)1 as it contains no data
fbiraw_t2 = fbiraw_t2.drop('Rape\n(revised\ndefinition)1', axis = 1)



In [304]:

    
#Delete Arson Column as there is insufficient data
# 'The FBI does not publish arson data unless it receives data from either the agency or the state
#  for all 12 months of the calendar year.'
fbiraw_t2 = fbiraw_t2.drop('Arson3', axis = 1)



In [305]:

    
#Clean tail from the data set

#Re-shape dataset excluding the last 3 rows of the dataset as they don´t contain relevant information for the model
fbiraw_t2 = fbiraw_t2[:-3]

#Change names in Columns
fbiraw_t2= fbiraw_t2.rename(columns={'Violent\ncrime': 'Violent Crime', 'Murder and\nnonnegligent\nmanslaughter': 'Murder','Rape\n(legacy\ndefinition)2': 'Rape', 'Robbery': 'Robbery', 'Aggravated\nassault': 'Assault', 'Property\ncrime': 'PropertyCrime', 'Burglary': 'Burglary', 'Larceny-\ntheft': 'Larceny & Theft', 'Motor\nvehicle\ntheft': 'MotorVehicleTheft'})



In [306]:

    
#Analyse missing information
fbiraw_t2.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 348 entries, 4 to 351
Data columns (total 11 columns):
City                 348 non-null object
Population           348 non-null object
Violent Crime        348 non-null object
Murder               348 non-null object
Rape                 348 non-null object
Robbery              348 non-null object
Assault              348 non-null object
PropertyCrime        348 non-null object
Burglary             348 non-null object
Larceny & Theft      348 non-null object
MotorVehicleTheft    348 non-null object
dtypes: object(11)
memory usage: 17.7+ KB



In [307]:

    
#Change all columns from object to float
locale.setlocale(locale.LC_NUMERIC, '')
fbiraw_t2['Population'] = fbiraw_t2['Population'].apply(atof)
fbiraw_t2['Violent Crime'] = fbiraw_t2['Violent Crime'].apply(atof)
fbiraw_t2['Murder'] = fbiraw_t2['Murder'].apply(atof)
fbiraw_t2['Rape'] = fbiraw_t2['Rape'].apply(atof)
fbiraw_t2['Robbery'] = fbiraw_t2['Robbery'].apply(atof)
fbiraw_t2['Assault'] = fbiraw_t2['Assault'].apply(atof)
fbiraw_t2['PropertyCrime'] = fbiraw_t2['PropertyCrime'].apply(atof)
fbiraw_t2['Burglary'] = fbiraw_t2['Burglary'].apply(atof)
fbiraw_t2['Larceny & Theft'] = fbiraw_t2['Larceny & Theft'].apply(atof)
fbiraw_t2['MotorVehicleTheft'] = fbiraw_t2['MotorVehicleTheft'].apply(atof)
fbiraw_t2.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 348 entries, 4 to 351
Data columns (total 11 columns):
City                 348 non-null object
Population           348 non-null float64
Violent Crime        348 non-null float64
Murder               348 non-null float64
Rape                 348 non-null float64
Robbery              348 non-null float64
Assault              348 non-null float64
PropertyCrime        348 non-null float64
Burglary             348 non-null float64
Larceny & Theft      348 non-null float64
MotorVehicleTheft    348 non-null float64
dtypes: float64(10), object(1)
memory usage: 31.3+ KB



In [308]:

    
#Reindex the dataframe

fbiraw_t3 = fbiraw_t2.reset_index(drop=True)
fbiraw_t2.head()









    Out[308]:







  
    
      
      City
      Population
      Violent Crime
      Murder
      Rape
      Robbery
      Assault
      PropertyCrime
      Burglary
      Larceny & Theft
      MotorVehicleTheft
    
  
  
    
      4
      Adams Village
      1861.0
      0.0
      0.0
      0.0
      0.0
      0.0
      12.0
      2.0
      10.0
      0.0
    
    
      5
      Addison Town and Village
      2577.0
      3.0
      0.0
      0.0
      0.0
      3.0
      24.0
      3.0
      20.0
      1.0
    
    
      6
      Akron Village
      2846.0
      3.0
      0.0
      0.0
      0.0
      3.0
      16.0
      1.0
      15.0
      0.0
    
    
      7
      Albany
      97956.0
      791.0
      8.0
      30.0
      227.0
      526.0
      4090.0
      705.0
      3243.0
      142.0
    
    
      8
      Albion Village
      6388.0
      23.0
      0.0
      3.0
      4.0
      16.0
      223.0
      53.0
      165.0
      5.0



In [309]:

    
#Extract only the columns that are needed

fbiraw_t3 = fbiraw_t2



In [310]:

    
#Eliminate outliers

fbiraw_t3 = fbiraw_t3[fbiraw_t3.PropertyCrime < 450].reset_index(drop=True)

#Describe the dataset
fbiraw_t3.describe()









    Out[310]:







  
    
      
      Population
      Violent Crime
      Murder
      Rape
      Robbery
      Assault
      PropertyCrime
      Burglary
      Larceny & Theft
      MotorVehicleTheft
    
  
  
    
      count
      281.000000
      281.000000
      281.000000
      281.000000
      281.000000
      281.000000
      281.000000
      281.000000
      281.000000
      281.000000
    
    
      mean
      7921.523132
      8.231317
      0.053381
      0.608541
      1.825623
      5.743772
      119.836299
      19.039146
      98.220641
      2.576512
    
    
      std
      7687.278137
      12.442147
      0.240530
      1.249199
      3.596852
      8.590599
      115.075293
      19.842286
      96.809428
      3.619691
    
    
      min
      526.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      2498.000000
      1.000000
      0.000000
      0.000000
      0.000000
      1.000000
      32.000000
      5.000000
      23.000000
      0.000000
    
    
      50%
      5213.000000
      4.000000
      0.000000
      0.000000
      1.000000
      3.000000
      77.000000
      13.000000
      62.000000
      1.000000
    
    
      75%
      10374.000000
      11.000000
      0.000000
      1.000000
      2.000000
      7.000000
      180.000000
      25.000000
      150.000000
      3.000000
    
    
      max
      37438.000000
      132.000000
      2.000000
      9.000000
      34.000000
      87.000000
      442.000000
      107.000000
      410.000000
      21.000000



In [311]:

    
#Print length of dataset and sort values by Population to see how many datapoints are excluded
print(len(fbiraw_t3), len(fbiraw_t2) - len(fbiraw_t3))
fbiraw_t3.sort_values('PropertyCrime',ascending=False).head()









    



281 67






    Out[311]:







  
    
      
      City
      Population
      Violent Crime
      Murder
      Rape
      Robbery
      Assault
      PropertyCrime
      Burglary
      Larceny & Theft
      MotorVehicleTheft
    
  
  
    
      17
      Bethlehem Town
      34243.0
      13.0
      0.0
      0.0
      3.0
      10.0
      442.0
      50.0
      388.0
      4.0
    
    
      48
      Cicero Town
      29571.0
      12.0
      0.0
      0.0
      2.0
      10.0
      440.0
      45.0
      393.0
      2.0
    
    
      262
      Ulster Town
      12195.0
      14.0
      0.0
      3.0
      2.0
      9.0
      437.0
      25.0
      410.0
      2.0
    
    
      192
      Oneida
      11220.0
      18.0
      0.0
      2.0
      4.0
      12.0
      434.0
      68.0
      357.0
      9.0
    
    
      54
      Corning
      11087.0
      35.0
      0.0
      3.0
      8.0
      24.0
      416.0
      63.0
      349.0
      4.0



In [312]:

    
#Plot the relationships between variables
sns.set_style("white")

#Conisder only the vairables suitable for the model
dfcont = fbiraw_t3

# Scatterplot matrix.
g = sns.PairGrid(dfcont, diag_sharey=False)
g.map_upper(plt.scatter, alpha=.5)

# Fit line summarizing the linear relationship of the two variables.
g.map_lower(sns.regplot, scatter_kws=dict(alpha=0))

# Give information about the univariate distributions of the variables.
g.map_diag(sns.kdeplot, lw=3)
plt.show()









    



d:\users\borja.gonzalez\appdata\local\programs\python\python36-32\lib\site-packages\matplotlib\axes\_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "



In [313]:

    
#Create the new feature Population2

#fbiraw_t3['Population2'] = fbiraw_t3['Population']*fbiraw_t3['Population']



In [314]:

    
#Convert Robbery into a categorical feature

fbiraw_t3.loc[fbiraw_t3['Robbery'] > 0, 'Robbery'] = 1



In [315]:

    
#Convert Murder into a categorical feature

fbiraw_t3.loc[fbiraw_t3['Murder'] > 0, 'Murder'] = 1



In [316]:

    
# Initialize the figure with a logarithmic x axis
f, ax = plt.subplots(figsize=(7, 6))
ax.set_xscale("log")


# Define the variables that are going to be plot
df_long = fbiraw_t3

#Boxplot vairables
ax = sns.boxplot(data=df_long, orient="h", palette="Set2")



In [317]:

    
fbiraw_t2.head()









    Out[317]:







  
    
      
      City
      Population
      Violent Crime
      Murder
      Rape
      Robbery
      Assault
      PropertyCrime
      Burglary
      Larceny & Theft
      MotorVehicleTheft
    
  
  
    
      4
      Adams Village
      1861.0
      0.0
      0.0
      0.0
      0.0
      0.0
      12.0
      2.0
      10.0
      0.0
    
    
      5
      Addison Town and Village
      2577.0
      3.0
      0.0
      0.0
      0.0
      3.0
      24.0
      3.0
      20.0
      1.0
    
    
      6
      Akron Village
      2846.0
      3.0
      0.0
      0.0
      0.0
      3.0
      16.0
      1.0
      15.0
      0.0
    
    
      7
      Albany
      97956.0
      791.0
      8.0
      30.0
      227.0
      526.0
      4090.0
      705.0
      3243.0
      142.0
    
    
      8
      Albion Village
      6388.0
      23.0
      0.0
      3.0
      4.0
      16.0
      223.0
      53.0
      165.0
      5.0



In [318]:

    
#Transform dataset into final dataset with features

fbidata = fbiraw_t3.drop('City',axis=1)



In [319]:

    
names = fbidata.columns
df_scaled = pd.DataFrame(preprocessing.scale(fbidata), columns = names)
df_scaled.describe()









    Out[319]:







  
    
      
      Population
      Violent Crime
      Murder
      Rape
      Robbery
      Assault
      PropertyCrime
      Burglary
      Larceny & Theft
      MotorVehicleTheft
    
  
  
    
      count
      2.810000e+02
      2.810000e+02
      2.810000e+02
      2.810000e+02
      2.810000e+02
      2.810000e+02
      2.810000e+02
      2.810000e+02
      2.810000e+02
      2.810000e+02
    
    
      mean
      6.321555e-18
      1.896466e-17
      -1.896466e-17
      -5.689399e-17
      -1.390742e-16
      -1.264311e-17
      3.160777e-17
      5.057244e-17
      2.212544e-17
      -4.425088e-17
    
    
      std
      1.001784e+00
      1.001784e+00
      1.001784e+00
      1.001784e+00
      1.001784e+00
      1.001784e+00
      1.001784e+00
      1.001784e+00
      1.001784e+00
      1.001784e+00
    
    
      min
      -9.637634e-01
      -6.627476e-01
      -2.289857e-01
      -4.880140e-01
      -1.010734e+00
      -6.698042e-01
      -1.043231e+00
      -9.612357e-01
      -1.016387e+00
      -7.130745e-01
    
    
      25%
      -7.067780e-01
      -5.822322e-01
      -2.289857e-01
      -4.880140e-01
      -1.010734e+00
      -5.531902e-01
      -7.646560e-01
      -7.087990e-01
      -7.783833e-01
      -7.130745e-01
    
    
      50%
      -3.529670e-01
      -3.406861e-01
      -2.289857e-01
      -4.880140e-01
      9.893802e-01
      -3.199622e-01
      -3.729100e-01
      -3.049004e-01
      -3.748112e-01
      -4.363149e-01
    
    
      75%
      3.195998e-01
      2.229216e-01
      -2.289857e-01
      3.139271e-01
      9.893802e-01
      1.464937e-01
      5.237531e-01
      3.009476e-01
      5.358129e-01
      1.172042e-01
    
    
      max
      3.846503e+00
      9.965282e+00
      4.367085e+00
      6.729456e+00
      9.893802e-01
      9.475613e+00
      2.804585e+00
      4.440909e+00
      3.226293e+00
      5.098877e+00



In [320]:

    
fbidata.to_csv('fbidata.csv')

Linear regression model



In [321]:

    
# Instantiate and fit our model.
regr = linear_model.LinearRegression()

#Create Variables for the model by dropping variables from the initial list
Y = df_scaled['PropertyCrime'].values.ravel()
X = df_scaled.drop(['Violent Crime','Murder','Larceny & Theft','PropertyCrime', 'Rape','MotorVehicleTheft','Assault'],axis=1)
regr.fit(X, Y)

# Inspect the results.
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X, Y))
print('\nVariables in the model: \n',list(X.columns))









    



Coefficients: 
 [ 0.28185986  0.1676387   0.6022175 ]

Intercept: 
 2.26845917801e-17

R-squared:
0.812488657335

Variables in the model: 
 ['Population', 'Robbery', 'Burglary']



In [322]:

    
# Initialize the figure with a logarithmic x axis
f, ax = plt.subplots(figsize=(7, 6))


# Define the variables that are going to be plot
df_long = X

#Boxplot vairables
ax = sns.boxplot(data=df_long, orient="h", palette="Set2")

Assumption 1: Linear relationship



In [323]:

    
# Establish the pair outcome-feature we want to compare
outcome = Y
feature = X['Burglary']

# Plot the data as-is. Looks a mite quadratic.
plt.scatter( feature,outcome)
plt.title('Relationshib between Outcome & Features')
plt.xlabel('Features')
plt.ylabel('Outcome')
plt.show()

Assumption 2: Multivariate normality



In [324]:

    
# Extract predicted values.
predicted = regr.predict(X).ravel()
actual = df_scaled['PropertyCrime']

# Calculate the error, also called the residual.
residual = actual - predicted

# Plot the distribution of errors.
plt.hist(residual)
plt.title('Residual counts')
plt.xlabel('Residual')
plt.ylabel('Count')
plt.show()

Assumption 3: Homoscedasticity



In [325]:

    
plt.scatter(predicted, residual)
plt.xlabel('Predicted')
plt.ylabel('Residual')
plt.axhline(y=0)
plt.title('Residual vs. Predicted')
plt.show()

Assumption 4: Low multicollinearity



In [326]:

    
# Build up the correlation mtrix
Z = X
correlation_matrix = Z.corr()
display(correlation_matrix)









    







  
    
      
      Population
      Robbery
      Burglary
    
  
  
    
      Population
      1.000000
      0.432624
      0.568601
    
    
      Robbery
      0.432624
      1.000000
      0.536684
    
    
      Burglary
      0.568601
      0.536684
      1.000000

PCA & RFE



In [327]:

    
#Eigenvectores & Eigenvalues

eig_vals, eig_vecs = np.linalg.eig(correlation_matrix)

# Inspecting the eigenvalues and eigenvectors.
for i in range(len(eig_vals)):
    eigvecs = eig_vecs[:, i].reshape(1, len(X.columns)).T
    print('Eigenvector {}: \n{}'.format(i + 1, eigvecs))
    print('Eigenvalue {}: {}'.format(i + 1, eig_vals[i]))
    print(40 * '-')


sklearn_pca = PCA(n_components=len(X.columns))
Y_sklearn = sklearn_pca.fit_transform(correlation_matrix)

print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    sklearn_pca.explained_variance_ratio_
)









    



Eigenvector 1: 
[[-0.56929822]
 [-0.55606052]
 [-0.60555449]]
Eigenvalue 1: 2.027376437109163
----------------------------------------
Eigenvector 2: 
[[-0.65096928]
 [ 0.75476172]
 [-0.08107866]]
Eigenvalue 2: 0.5692173321158762
----------------------------------------
Eigenvector 3: 
[[-0.50213399]
 [-0.34803943]
 [ 0.79166281]]
Eigenvalue 3: 0.40340623077496046
----------------------------------------
The percentage of total variance in the dataset explained by each component from Sklearn PCA.
 [  6.60047913e-01   3.39952087e-01   2.77469582e-32]



In [328]:

    
#From the Scree plot.

plt.plot(eig_vals)
plt.show()



In [329]:

    
# create the RFE model and select features

nfeatures = (len(X.columns))
rfe = RFE(regr,nfeatures)
fit = rfe.fit(X,Y)

# summarize the selection of the features

result_RFE = pd.DataFrame(list(zip(X.head(0), rfe.ranking_, rfe.support_)),columns=['Features','Ranking','Support'] )
result_RFE.sort_values('Ranking')









    Out[329]:







  
    
      
      Features
      Ranking
      Support
    
  
  
    
      0
      Population
      1
      True
    
    
      1
      Robbery
      1
      True
    
    
      2
      Burglary
      1
      True



In [330]:

    
#Result as a list for "keywords"
result= result_RFE[result_RFE.Support == False]
drop = result["Features"].tolist()
drop









    Out[330]:





[]

Cross Validation & Predictive Power of the model



In [ ]:

    
#Initiating the cross validation generator, N splits = 10
kf = KFold(10)



In [331]:

    
#Cross validate the model on the folds

scores = cross_val_score(regr, X, Y, cv=kf)
print('Cross-validated scores:', scores)
print('Cross-validation average:', scores.mean())









    



Cross-validated scores: [ 0.91582688  0.8232567   0.83997206  0.74336812  0.62213282  0.75082512
  0.84946267  0.82764719  0.90397765  0.6330684 ]
Cross-validation average: 0.790953761621



In [332]:

    
#Plot Predictions vs values
predictions = cross_val_predict(regr, X, Y, cv=kf)
plt.scatter(Y, predictions)
plt.xlabel('Real Values')
plt.ylabel('Predicted Values')
plt.axhline(y=0)
plt.title('Real vs. Predicted')
plt.show()



In [333]:

    
#Predictive accuracy
accuracy = metrics.r2_score(Y, predictions)
print ('Cross-Predicted Accuracy:', accuracy)









    



Cross-Predicted Accuracy: 0.805132585817

Validating a linear regression



In [366]:

    
#Testing the results with statsmodel
linear_formula = 'PropertyCrime ~ Population+Robbery+Burglary'
lm = smf.ols(formula=linear_formula, data=df_scaled).fit()
print('Coefficients: \n', lm.params)
print('\n P values: \n', lm.pvalues)
print('\n R squared: \n', lm.rsquared)
print('\n Confidence Interval: \n', lm.conf_int())









    



Coefficients: 
 Intercept     3.122502e-17
Population    2.818599e-01
Robbery       1.676387e-01
Burglary      6.022175e-01
dtype: float64

 P values: 
 Intercept     1.000000e+00
Population    1.984592e-16
Robbery       1.898919e-07
Burglary      9.342222e-47
dtype: float64

 R squared: 
 0.812488657335

 Confidence Interval: 
                    0         1
Intercept  -0.051218  0.051218
Population  0.218520  0.345200
Robbery     0.105888  0.229389
Burglary    0.534539  0.669896



In [367]:

    
# Use wls_prediction_std to build confidence intervals
prstd, iv_l, iv_u = wls_prediction_std(lm)

plt.figure()
plt.plot(iv_u[0:15], 'o', color='r')
plt.plot(iv_l[0:15], 'o', color='r')
plt.plot(lm.fittedvalues[0:15], 'o', color='b')
plt.title('blue: predicted, red: 95% CI')
plt.show()

Saving the model



In [368]:

    
import pickle
# save the model to disk
filename = 'finalized_regr.sav'
pickle.dump(regr, open(filename, 'wb'))



In [ ]:

	Table 8	Unnamed: 1	Unnamed: 2	Unnamed: 3	Unnamed: 4	Unnamed: 5	Unnamed: 6	Unnamed: 7	Unnamed: 8	Unnamed: 9	Unnamed: 10	Unnamed: 11	Unnamed: 12
0	NEW YORK	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	Offenses Known to Law Enforcement	NaN	NaN	NaN	NaN		NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	by City, 2013	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	City	Population	Violent\ncrime	Murder and\nnonnegligent\nmanslaughter	Rape\n(revised\ndefinition)1	Rape\n(legacy\ndefinition)2	Robbery	Aggravated\nassault	Property\ncrime	Burglary	Larceny-\ntheft	Motor\nvehicle\ntheft	Arson3
4	Adams Village	1,861	0	0	NaN	0	0	0	12	2	10	0	0

	City	Population	Violent Crime	Murder	Rape	Robbery	Assault	PropertyCrime	Burglary	Larceny & Theft	MotorVehicleTheft
4	Adams Village	1861.0	0.0	0.0	0.0	0.0	0.0	12.0	2.0	10.0	0.0
5	Addison Town and Village	2577.0	3.0	0.0	0.0	0.0	3.0	24.0	3.0	20.0	1.0
6	Akron Village	2846.0	3.0	0.0	0.0	0.0	3.0	16.0	1.0	15.0	0.0
7	Albany	97956.0	791.0	8.0	30.0	227.0	526.0	4090.0	705.0	3243.0	142.0
8	Albion Village	6388.0	23.0	0.0	3.0	4.0	16.0	223.0	53.0	165.0	5.0

	Population	Violent Crime	Murder	Rape	Robbery	Assault	PropertyCrime	Burglary	Larceny & Theft	MotorVehicleTheft
count	281.000000	281.000000	281.000000	281.000000	281.000000	281.000000	281.000000	281.000000	281.000000	281.000000
mean	7921.523132	8.231317	0.053381	0.608541	1.825623	5.743772	119.836299	19.039146	98.220641	2.576512
std	7687.278137	12.442147	0.240530	1.249199	3.596852	8.590599	115.075293	19.842286	96.809428	3.619691
min	526.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	2498.000000	1.000000	0.000000	0.000000	0.000000	1.000000	32.000000	5.000000	23.000000	0.000000
50%	5213.000000	4.000000	0.000000	0.000000	1.000000	3.000000	77.000000	13.000000	62.000000	1.000000
75%	10374.000000	11.000000	0.000000	1.000000	2.000000	7.000000	180.000000	25.000000	150.000000	3.000000
max	37438.000000	132.000000	2.000000	9.000000	34.000000	87.000000	442.000000	107.000000	410.000000	21.000000

	City	Population	Violent Crime	Rape	Robbery	Assault	PropertyCrime	Burglary	Larceny & Theft	MotorVehicleTheft
17	Bethlehem Town	34243.0	13.0	0.0	3.0	10.0	442.0	50.0	388.0	4.0
48	Cicero Town	29571.0	12.0	0.0	2.0	10.0	440.0	45.0	393.0	2.0
262	Ulster Town	12195.0	14.0	3.0	2.0	9.0	437.0	25.0	410.0	2.0
192	Oneida	11220.0	18.0	2.0	4.0	12.0	434.0	68.0	357.0	9.0
54	Corning	11087.0	35.0	3.0	8.0	24.0	416.0	63.0	349.0	4.0

	Population	Violent Crime	Murder	Rape	Robbery	Assault	PropertyCrime	Burglary	Larceny & Theft	MotorVehicleTheft
count	2.810000e+02	2.810000e+02	2.810000e+02	2.810000e+02	2.810000e+02	2.810000e+02	2.810000e+02	2.810000e+02	2.810000e+02	2.810000e+02
mean	6.321555e-18	1.896466e-17	-1.896466e-17	-5.689399e-17	-1.390742e-16	-1.264311e-17	3.160777e-17	5.057244e-17	2.212544e-17	-4.425088e-17
std	1.001784e+00	1.001784e+00	1.001784e+00	1.001784e+00	1.001784e+00	1.001784e+00	1.001784e+00	1.001784e+00	1.001784e+00	1.001784e+00
min	-9.637634e-01	-6.627476e-01	-2.289857e-01	-4.880140e-01	-1.010734e+00	-6.698042e-01	-1.043231e+00	-9.612357e-01	-1.016387e+00	-7.130745e-01
25%	-7.067780e-01	-5.822322e-01	-2.289857e-01	-4.880140e-01	-1.010734e+00	-5.531902e-01	-7.646560e-01	-7.087990e-01	-7.783833e-01	-7.130745e-01
50%	-3.529670e-01	-3.406861e-01	-2.289857e-01	-4.880140e-01	9.893802e-01	-3.199622e-01	-3.729100e-01	-3.049004e-01	-3.748112e-01	-4.363149e-01
75%	3.195998e-01	2.229216e-01	-2.289857e-01	3.139271e-01	9.893802e-01	1.464937e-01	5.237531e-01	3.009476e-01	5.358129e-01	1.172042e-01
max	3.846503e+00	9.965282e+00	4.367085e+00	6.729456e+00	9.893802e-01	9.475613e+00	2.804585e+00	4.440909e+00	3.226293e+00	5.098877e+00

	Population	Robbery	Burglary
Population	1.000000	0.432624	0.568601
Robbery	0.432624	1.000000	0.536684
Burglary	0.568601	0.536684	1.000000