In [2]:
import json
import jieba
In [3]:
# Load the raw training set. The file is parsed as JSON Lines: every
# non-empty line is decoded as one independent JSON document.
train_file1 = '../data/sogou_ccir/train.1.json'
with open(train_file1, 'r', encoding='utf-8') as f:
    # Iterate over the file handle directly rather than f.readlines(), so the
    # lines are not materialised in a second list, and skip blank lines (for
    # example a trailing empty line), which would make json.loads raise.
    data = [json.loads(line) for line in f if line.strip()]
In [4]:
# Flatten the nested records into one row per (query, passage) pair.
# Each row is [query_id, query, passage_id, passage_text, label].
# Built as a comprehension so that no temporary names leak into the notebook
# namespace; in particular the per-row list is no longer bound to `train`,
# the name used for the DataFrame later in the notebook.
train_data = [
    [
        query['query_id'],
        query['query'],
        passage['passage_id'],
        passage['passage_text'],
        passage['label'],
    ]
    for query in data
    for passage in query['passages']
]
In [8]:
import pandas as pd
# Tabulate the flattened rows held in `train_data`, one DataFrame row per
# list, labelling the five positional fields with explicit column names.
train = pd.DataFrame(
    data=train_data,
    columns=['query_id', 'query', 'passage_id', 'passage_text', 'label'],
)
In [9]:
# Display the first rows of `train` (the cell's last expression is rendered).
train.head()
Out[9]:
In [10]:
# Display pandas' summary statistics for `train`.
train.describe()
Out[10]:
In [11]:
import jieba
In [13]:
# Tokenise every (query, passage_text) pair of `train` with jieba
# (cut_all=False) and keep one token list per row.
# The query and the passage are segmented separately and their token lists
# are then concatenated. Segmenting the joined string `query + passage_text`
# allows jieba to emit a token that straddles the boundary between the two
# texts, since nothing separates them.
jieba_train = [
    jieba.lcut(query, cut_all=False) + jieba.lcut(passage, cut_all=False)
    for query, passage in zip(train['query'], train['passage_text'])
]
In [30]:
# Persist the tokenised corpus: one record per line, tokens joined by commas.
# NOTE(review): tokens are written verbatim, so a token that is itself a
# comma is not escaped; confirm that the downstream reader tolerates this.
with open('../data/sogou_ccir/jieba_train.csv', 'w', encoding='utf-8') as f:
    # The loop variable is `tokens`, not `data`, so that this cell does not
    # overwrite the parsed JSON records bound to `data` earlier on.
    for tokens in jieba_train:
        # Terminate each record with a newline. Without it every record is
        # written onto a single line and the last token of one record fuses
        # with the first token of the next.
        f.write(",".join(tokens) + "\n")
In [1]:
import word2vec
In [ ]: