In [1]:
%matplotlib inline
# import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import json
# from nltk.tokenize import WhitespaceTokenizer
# from string import punctuation
# from pymorphy2 import MorphAnalyzer
In [9]:
# Load avito_train.tsv (tab-separated) into a DataFrame.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0; read_csv with
# index_col=0 and parse_dates=True reproduces from_csv's defaults (first column as
# the index, index values parsed as dates where possible).
# For a quick experiment on a subset of rows, additionally pass nrows=30000.
data = pd.read_csv('avito_train.tsv', sep='\t', index_col=0, parse_dates=True)
In [11]:
# Display only the first row of the loaded frame.
data.iloc[:1]
Out[11]:
In [12]:
# Display the frame's dimensions as a (rows, columns) tuple.
data.shape
Out[12]:
In [13]:
# List the distinct values that occur in the is_proved column.
data["is_proved"].unique()
Out[13]:
In [21]:
# Report how the is_proved column splits into missing values, zeros and ones.
# Each count is computed once and reused for both the printed lines and the table.
is_proved = data.is_proved
n_unchecked = is_proved.isnull().sum()   # rows where is_proved is missing
n_prohibited = (is_proved == 0.).sum()   # rows where is_proved == 0
n_correct = (is_proved == 1.).sum()      # rows where is_proved == 1

print("Data info")
print("Non checked: " + str(n_unchecked))
print("Prohibited: " + str(n_prohibited))
print("Correct: " + str(n_correct))

# The table previously held only the first two counts, unlabeled; show all three
# with the same labels used in the printed report.
pd.DataFrame(
    [n_unchecked, n_prohibited, n_correct],
    index=["Non checked", "Prohibited", "Correct"],
    columns=["count"],
)
In [18]:
# Display pandas' summary statistics for the frame.
data.describe()
Out[18]:
In [22]:
# List the distinct values that occur in the category column.
data["category"].unique()
Out[22]:
In [24]:
# For each column, report whether it contains at least one missing value.
data.isnull().any()
Out[24]:
In [53]:
# One-row summary of is_proved: number of rows equal to 0, equal to 1, and missing.
# The previous expression had no receiver before `.append(...)`, which is a syntax
# error, and Series.append no longer exists in pandas 2.x. Counting each value
# explicitly also guarantees that every number lands under its matching column
# label, instead of depending on the frequency ordering of value_counts().
proved = data.is_proved
desc = pd.DataFrame(
    [[(proved == 0).sum(), (proved == 1).sum(), proved.isnull().sum()]],
    columns=[0, 1, "NaN"],
)
desc
Out[53]:
In [54]:
# Count how many rows carry each distinct is_blocked value.
data["is_blocked"].value_counts()
Out[54]:
In [55]:
# Ratio of two hard-coded integer literals.
# NOTE(review): the operands are presumably copied by hand from the is_blocked
# value_counts() output of the previous cell — confirm, and consider deriving them
# from `data` so the figure cannot go stale when the input changes.
# NOTE(review): on a Python 2 kernel `/` between two ints floors this to 0; the
# expression only yields a fraction under Python 3 true division — confirm kernel.
274996 / 3720807
Out[55]:
In [ ]: