In [1]:
import pandas
import numpy

def main():
    labels = read_labels()
    communities = pandas.read_csv('data/communities_data.txt', sep=",", names=labels)
    target = communities['ViolentCrimesPerPop numeric']
    data = communities  # .drop('ViolentCrimesPerPop numeric', axis=1)
    missing_data = count_missing_data(data, labels)
    print(len(missing_data))
    features_to_remove = find_incomplete_features(labels, missing_data, data)
    features_to_remove.append('communityname string')
    data_with_feature_complete = data.drop(features_to_remove, axis=1)
    print(data_with_feature_complete)
    data_with_feature_complete.to_csv('data/data_with_feature_complete.csv', index=False, encoding='utf-8')

def find_incomplete_features(labels, missing_data, data):
    # labels = labels[:-1]  # Exclude 'ViolentCrimesPerPop numeric' feature
    # Collect the names of all features that have at least one missing value.
    missing_features = []
    for x in range(len(labels)):
        if missing_data[x] > 0:
            missing_features.append(labels[x])
    return missing_features

def count_missing_data(data, labels):
    # labels = labels[:-1]  # Exclude 'ViolentCrimesPerPop numeric' feature
    # Count, per feature, how many rows use '?' as the missing-value marker.
    missing_data = []
    for label in labels:
        count = 0
        for x in range(len(data)):  # iterate over all 1994 rows
            if data[label][x] == '?':
                count = count + 1
        missing_data.append(count)
    print(missing_data)
    return missing_data

def read_labels():
    # Read one feature name per line from the names file.
    labels = []
    with open('data/features_names.txt', 'r') as file:
        for line in file:
            labels.append(line.rstrip('\n'))
    return labels

main()
[0, 1174, 1177, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1675, 1675, 1675, 1675, 1675, 1675, 1675, 1675, 1675, 1675, 1675, 1675, 1675, 1675, 1675, 1675, 1675, 0, 0, 0, 1675, 1675, 1675, 1675, 0, 1675, 0]
128
      state numeric  fold numeric  population numeric  householdsize numeric  ...  PctUsePubTrans numeric  LemasPctOfficDrugUn numeric  ViolentCrimesPerPop numeric
0                 8             1                0.19                   0.33  ...                    0.20                         0.32                         0.20
1                53             1                0.00                   0.16  ...                    0.45                         0.00                         0.67
...             ...           ...                 ...                    ...  ...                     ...                          ...                          ...
1992             25            10                0.08                   0.51  ...                    0.33                         0.22                         0.19
1993              6            10                0.20                   0.78  ...                    0.05                         1.00                         0.48

[1994 rows x 102 columns]
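The per-column loops in count_missing_data and find_incomplete_features can also be written as one vectorized comparison. A minimal sketch, assuming the raw communities DataFrame (read with names=labels) is available:

missing_counts = (communities == '?').sum()                       # pandas Series: '?' occurrences per column
features_to_remove = missing_counts[missing_counts > 0].index.tolist()
features_to_remove.append('communityname string')                 # also drop the non-predictive identifier column
print(len(missing_counts))                                        # 128 columns, matching the output above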
In [2]:
import pandas
import numpy
communities = pandas.read_csv('data/data_with_feature_complete.csv', sep=",")
target = communities['ViolentCrimesPerPop numeric']
data = communities.drop('ViolentCrimesPerPop numeric', axis=1)
In [3]:
X = data
y = target
# split the data into training and test sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)  # test_size=.5 would reserve half the data for testing
print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))
1794
200
1794
200
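The split above is random, so the exact 1794/200 partition changes on every run. Passing a fixed random_state makes it reproducible, and on scikit-learn 0.18+ the function lives in sklearn.model_selection rather than the deprecated sklearn.cross_validation. A minimal sketch, assuming the same X and y and a newer scikit-learn:

from sklearn.model_selection import train_test_split              # sklearn >= 0.18
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)                         # random_state=42 is an arbitrary example seed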
In [31]:
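# Note: confusion_matrix expects discrete class labels, so it does not apply to the
# continuous ViolentCrimesPerPop target; this cell is left commented out.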
# from sklearn.cross_validation import cross_val_predict
# from sklearn import linear_model
# regr = linear_model.LinearRegression()
# from sklearn.metrics import confusion_matrix
# y_pred = cross_val_predict(regr, data, target, cv=10)
# confusion = confusion_matrix(target, y_pred)
# print (confusion)
In [39]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
import numpy
from sklearn import metrics
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(X_train, y_train)
from sklearn.cross_validation import cross_val_predict
y_pred = cross_val_predict(regr, data, target, cv=10).mean()
metrics.r2_score(target, y_pred)
# Make predictions using the testing set
y_pred = regr.predict(X_test)
y_pred_array = numpy.array(y_pred.tolist())
#print y_pred_array
#print y_pred
#print y_pred_array
#confusion = confusion_matrix(y_test, y_pred_array)
# The coefficients
#print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-39-ac75eaa7b7fd> in <module>()
14 from sklearn.cross_validation import cross_val_predict
15 y_pred = cross_val_predict(regr, data, target, cv=10).mean()
---> 16 metrics.r2_score(target, y_pred)
17
18 # Make predictions using the testing set
/usr/lib/python2.7/dist-packages/sklearn/metrics/regression.pyc in r2_score(y_true, y_pred, sample_weight, multioutput)
442 """
443 y_type, y_true, y_pred, multioutput = _check_reg_targets(
--> 444 y_true, y_pred, multioutput)
445
446 if sample_weight is not None:
/usr/lib/python2.7/dist-packages/sklearn/metrics/regression.pyc in _check_reg_targets(y_true, y_pred, multioutput)
72
73 """
---> 74 check_consistent_length(y_true, y_pred)
75 y_true = check_array(y_true, ensure_2d=False)
76 y_pred = check_array(y_pred, ensure_2d=False)
/usr/lib/python2.7/dist-packages/sklearn/utils/validation.pyc in check_consistent_length(*arrays)
171 """
172
--> 173 uniques = np.unique([_num_samples(X) for X in arrays if X is not None])
174 if len(uniques) > 1:
175 raise ValueError("Found arrays with inconsistent numbers of samples: "
/usr/lib/python2.7/dist-packages/sklearn/utils/validation.pyc in _num_samples(x)
120 if len(x.shape) == 0:
121 raise TypeError("Singleton array %r cannot be considered"
--> 122 " a valid collection." % x)
123 return x.shape[0]
124 else:
TypeError: Singleton array 0.23799792187087593 cannot be considered a valid collection.
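The TypeError above comes from the .mean() call: it collapses the cross-validated predictions into a single scalar, which r2_score then cannot align with the 1994-element target. A minimal corrected sketch, assuming the same regr, data, and target objects from this cell:

y_cv_pred = cross_val_predict(regr, data, target, cv=10)          # one prediction per sample; no .mean()
print(metrics.r2_score(target, y_cv_pred))                        # cross-validated R^2
print(mean_squared_error(target, y_cv_pred))                      # cross-validated mean squared error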
In [ ]:
In [ ]: