In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import time
import sklearn
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
%config InlineBackend.figure_format = 'png'
In [2]:
%%time
# Build the single-target training set for one snapshot date.
#
# Reads the label table and the raw customer table, keeps the rows of
# TRAIN_DATE whose labels sum to more than zero, and "melts" them: every label
# column equal to 1 produces one output row made of the first 24 columns of
# the customer row plus `target`, the 0-based position of that label column.
# The result is stored in `train` and written to ../input/train.csv.
TRAIN_DATE = '2015-06-28'
N_FEATURE_COLS = 24  # leading columns of train_ver2.csv copied through unchanged
N_LABEL_COLS = 24    # label columns scanned for a value of 1

# load data
labels = pd.read_csv('../input/labels.csv').astype(int)
trn = pd.read_csv('../input/train_ver2.csv')
fecha_dato = trn['fecha_dato']

# get index: rows of TRAIN_DATE that have at least one positive label.
# NOTE(review): the mask is computed on trn but filters labels, so the two
# files are presumably row-aligned (same length and order) -- confirm.
has_label = labels.loc[fecha_dato == TRAIN_DATE].sum(axis=1) > 0
train_index = has_label[has_label]

# Select by row label with .loc (.ix no longer exists in pandas >= 1.0) and
# keep features and labels in separate frames instead of writing the labels
# into a slice of trn.
# NOTE(review): labels.csv is assumed to hold the label columns in the same
# order as trn.columns[24:] -- confirm.
feature_rows = trn.loc[train_index.index, trn.columns[:N_FEATURE_COLS]]
label_rows = labels.loc[train_index.index]

# melt data into single target dataframe
# Plain lists are iterated instead of DataFrame.iterrows(), which would build
# a Series per row; row order and per-row target order are unchanged.
data = []
for features, flags in zip(feature_rows.values.tolist(),
                           label_rows.values.tolist()):
    for i in range(N_LABEL_COLS):
        if flags[i] == 1:
            data.append(features + [i])

cols = trn.columns[:N_FEATURE_COLS].tolist()
cols.append('target')
train = pd.DataFrame(data, columns=cols)
train.to_csv('../input/train.csv', index=False)
In [3]:
# Print the column index of trn, then the number of columns, on separate lines.
print(trn.columns, trn.shape[1], sep='\n')
In [6]:
# Distinct values of the "sexo" column, compared as strings.
sexo_text = trn["sexo"].astype(str)
np.unique(sexo_text)
Out[6]:
In [14]:
# Create a single-axes figure, 10 x 5 inches.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
In [15]:
# Bar chart of the number of rows in trn for each fecha_dato value.
# The figure is created in this cell and handed to seaborn through `ax`:
# with the inline backend, figures are closed when a cell finishes, so a
# figure sized in an earlier cell is not the one this plot would be drawn on.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x="fecha_dato", data=trn, alpha=0.5, ax=ax)
ax.set_title("Rows per fecha_dato")
plt.show()
trn[col].value_counts()
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.value_counts.html
In [28]:
# Number of rows per distinct fecha_dato value, most frequent first.
fecha_dato_counts = trn["fecha_dato"].value_counts()
fecha_dato_counts
Out[28]:
In [29]:
# Scratch value: the digit 2 with one space on each side.
aaaaa = ' 2 '
In [30]:
# int() ignores surrounding whitespace when parsing a string.
parsed_value = int(aaaaa)
parsed_value
Out[30]:
In [ ]:
# Turn the ' NA' placeholder into 0 and cast the column to int.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on train_data[col]: that call acts on an
# intermediate Series, which pandas does not guarantee to write through to
# train_data (and never does under Copy-on-Write).
# NOTE(review): train_data and col are not assigned in this cell -- confirm
# where they are expected to come from.
train_data[col] = train_data[col].replace(' NA', 0).astype(int)
In [33]:
# Distinct "sexo" values (as strings), returned as a plain Python list.
unique_sexo = np.unique(trn["sexo"].astype(str))
unique_sexo.tolist()
Out[33]:
In [34]:
# Sorted array of the distinct "sexo" values after casting them to str.
np.unique(trn["sexo"].astype(str).to_numpy())
Out[34]:
In [35]:
# Sum of every label column (trn.columns[24:]) for each fecha_dato value.
# All label columns are aggregated at once instead of picking label_cols[i]:
# no `i` is assigned in this cell, so the single-column form only ran when a
# loop variable happened to be left over in the kernel.
label_cols = trn.columns[24:].tolist()
trn.groupby('fecha_dato')[label_cols].sum()
Out[35]:
In [36]:
# Display label_cols, the list of column names taken from trn.columns[24:].
label_cols
Out[36]:
In [44]:
# np.asarray
# Convert the input to an array.
# Parameters
# ----------
# a : array_like
# Input data, in any form that can be converted to an array. This
# includes lists, lists of tuples, tuples, tuples of tuples, tuples
# of lists and ndarrays.
# dtype : data-type, optional
# By default, the data-type is inferred from the input data.
# order : {'C', 'F'}, optional
# Whether to use row-major (C-style) or
# column-major (Fortran-style) memory representation.
# Defaults to 'C'.
# Returns
# -------
# out : ndarray
# Array interpretation of `a`. No copy is performed if the input
# is already an ndarray. If `a` is a subclass of ndarray, a base
# class ndarray is returned.
# See Also
# --------
# asanyarray : Similar function which passes through subclasses.
# ascontiguousarray : Convert input to a contiguous array.
# asfarray : Convert input to a floating point ndarray.
# asfortranarray : Convert input to an ndarray with column-major
# memory order.
# asarray_chkfinite : Similar function which checks input for NaNs and Infs.
# fromiter : Create an array from an iterator.
# fromfunction : Construct an array by executing a function on grid
# positions.
# Examples
# --------
# Convert a list into an array:
# >>> a = [1, 2]
# >>> np.asarray(a)
# array([1, 2])
# Existing arrays are not copied:
# >>> a = np.array([1, 2])
# >>> np.asarray(a) is a
# True
# If `dtype` is set, array is copied only if dtype does not match:
# >>> a = np.array([1, 2], dtype=np.float32)
# >>> np.asarray(a, dtype=np.float32) is a
# True
# >>> np.asarray(a, dtype=np.float64) is a
# False
# Contrary to `asanyarray`, ndarray subclasses are not passed through:
# >>> issubclass(np.matrix, np.ndarray)
# True
# >>> a = np.matrix([[1, 2]])
# >>> np.asarray(a) is a
# False
# >>> np.asanyarray(a) is a
# True
.iloc
integer position을 통해 값을 찾을 수 있다. label로는 찾을 수 없다.
.loc
label을 통해 값을 찾을 수 있다. integer position으로는 찾을 수 없다.
.ix
integer position과 label 모두 사용할 수 있다. 만약 label이 숫자라면 label-based index만 된다. (참고: .ix는 pandas 0.20에서 deprecated 되었고 1.0에서 제거되었으므로, 현재는 .loc 또는 .iloc을 사용해야 한다.)
http://yeyej.blogspot.kr/2016/02/pandas-dataframe-iloc-loc-ix.html
In [45]:
# data = []
# for ind, (run, row) in enumerate(train.iterrows()):
# for i in range(24):
# if row[24+i] == 1:
# temp = row[:24].values.tolist()
# temp.append(i)
# data.append(temp)
In [ ]:
# lb = LabelEncoder()
# skip_cols = ['fecha_dato','ncodpers','target']
# # histogram of features
# for col in trn.columns:
# if col in skip_cols:
# continue
# print('='*50)
# print('col : ', col)
# # check category or number
# if col in category_cols:
# x = lb.fit_transform(trn[col])
# sns.jointplot(x,np.asarray(trn['target'])*1.0, kind="kde")
# else:
# x = trn[col]
# sns.jointplot(x,trn['target'], kind="kde")
# plt.show()