notebook.community

Edit and run



In [3]:

    
import pandas as pd
import nazarkav as nk
import os



In [4]:

    
# Data directory that lives inside the imported nazarkav package
data_path = os.path.join(nk.__path__[0], 'data')

# Column labels are supplied explicitly through `names=` below
cols = ['comment', 'c', 'rate', 'name', 'date']

# Load the labeled hotel reviews.
# NOTE(review): with `names=` and no `header=`, pandas treats the first
# line of the file as data — confirm the CSV has no header row.
dataset_file = os.path.join(data_path, 'hotel-dataset.csv')
data = pd.read_csv(dataset_file, names=cols)



In [5]:

    
# Row count of the dataset as loaded, before any cleaning
data.shape[0]









    Out[5]:





8499



In [6]:

    
# Per-column tally of missing entries
data.isnull().sum()









    Out[6]:





comment    14
c           0
rate        0
name        2
date        0
dtype: int64



In [7]:

    
# Drop every row that has a missing value in ANY column. This removes the
# rows with a missing `comment` and also the rows where only `name` is missing.
# NOTE(review): only `comment` and `c` are written out at the end of this
# notebook; if rows lacking just `name` should be kept, restrict the drop
# with `subset=['comment']` — confirm intent.
data = data.dropna(axis='index', how='any')



In [8]:

    
# Row count once the rows containing missing values are gone
data.shape[0]









    Out[8]:





8483



In [9]:

    
# Balanced sampling: draw 2000 reviews labeled 'pos' and 2000 labeled 'neg'.
# A fixed seed makes the draw repeatable, so re-running the notebook from a
# fresh kernel selects the same rows (and therefore writes the same output
# file) instead of a different random subset each time.
# `sample` draws without replacement by default, so each class must contain
# at least 2000 rows or pandas raises a ValueError.
SAMPLE_SEED = 42
pos_data = data[data['c'] == 'pos'].sample(n=2000, random_state=SAMPLE_SEED)
neg_data = data[data['c'] == 'neg'].sample(n=2000, random_state=SAMPLE_SEED)



In [10]:

    
# Stack the two samples (positives first, then negatives) into a single
# frame and renumber its rows from 0
polarity_parts = [pos_data, neg_data]
hotel_polarity = pd.concat(polarity_parts, ignore_index=True)



In [11]:

    
# Persist only the review text and its label as a tab-separated UTF-8 file,
# without the row index, next to the source dataset
polarity_path = os.path.join(data_path, 'hotel-polarity.tsv')
polarity_columns = ['comment', 'c']
hotel_polarity[polarity_columns].to_csv(
    polarity_path, sep='\t', encoding='utf8', index=False)