In [3]:
import pandas as pd
import nazarkav as nk
import os
In [4]:
# Data directory located inside the installed nazarkav package
data_path = os.path.join(nk.__path__[0], 'data')
# Column names assigned to the CSV fields, in file order
cols = ['comment', 'c', 'rate', 'name', 'date']
# Load the labeled hotel dataset
hotel_csv = os.path.join(data_path, 'hotel-dataset.csv')
data = pd.read_csv(hotel_csv, names=cols)
In [5]:
# Row count of the dataset as loaded
data.shape[0]
Out[5]:
In [6]:
# Per-column count of missing values
data.isnull().sum()
Out[6]:
In [7]:
# Remove records whose comment text or polarity label is missing.
# Only these two columns are used by the sampling and export cells shown
# in this notebook, so a gap in 'rate', 'name' or 'date' should not discard
# an otherwise usable record (a bare dropna() would drop on ANY column).
data = data.dropna(axis=0, subset=['comment', 'c'])
In [8]:
# Row count once records with missing values have been dropped
data.shape[0]
Out[8]:
In [9]:
# Sampling: draw an equally sized random subset from each polarity class.
# A fixed random_state makes the draw reproducible, so re-running the
# notebook regenerates the same rows instead of a different dataset.
# Note: DataFrame.sample raises ValueError if a class has fewer than
# SAMPLE_SIZE records (sampling is without replacement).
SAMPLE_SIZE = 2000
RANDOM_SEED = 42
pos_data = data[data['c'] == 'pos'].sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)
neg_data = data[data['c'] == 'neg'].sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)
In [10]:
# Stack the positive and negative samples into one frame,
# discarding the original row labels in favour of a fresh index
polarity_parts = [pos_data, neg_data]
hotel_polarity = pd.concat(polarity_parts, ignore_index=True)
In [11]:
# Export only the comment text and its label as a tab-separated,
# UTF-8 encoded file next to the source dataset (no index column)
polarity_path = os.path.join(data_path, 'hotel-polarity.tsv')
hotel_polarity[['comment', 'c']].to_csv(
    polarity_path, sep='\t', encoding='utf8', index=False)