In [2]:
import json
import jieba

In [3]:
# Location of the first training shard; it is parsed below as one JSON document per line.
train_file1 = '../data/sogou_ccir/train.1.json'
with open(train_file1, 'r', encoding='utf-8') as f:
    # Iterate the handle lazily instead of materialising every line with readlines(),
    # and skip blank/whitespace-only lines, which json.loads would reject.
    data = [json.loads(line) for line in f if line.strip()]

In [4]:
# Flatten the nested records into one row per (query, passage) pair, in the order
# [query_id, query, passage_id, passage_text, label].
# A comprehension keeps the loop variables local, so re-running this cell no longer
# rebinds the name `train`, which a later cell uses for the DataFrame.
train_data = [
    [
        query['query_id'],
        query['query'],
        passage['passage_id'],
        passage['passage_text'],
        passage['label'],
    ]
    for query in data
    for passage in query['passages']
]

In [8]:
import pandas as pd
# Wrap the flattened rows in a DataFrame; column order matches the order in which
# each row of `train_data` was assembled.
train = pd.DataFrame(
    data=train_data,
    columns=['query_id', 'query', 'passage_id', 'passage_text', 'label'],
)

In [9]:
# Preview the first rows to sanity-check the flattening: one row per (query, passage)
# pair, with the query text repeated for each of its passages.
train.head()


Out[9]:
query_id query passage_id passage_text label
0 10000 108颗佛珠的含义 1 念珠,又称佛珠、数珠,主要是指一些宗教在祈祷、歌颂、念经、念咒或灵修时所用的物品,一般在各种... 2
1 10000 108颗佛珠的含义 2 慈悲是你最好的武器。<br>学佛就是在学做人而已。<br>沈默是毁谤最好的答覆。<br>你要... 2
2 10000 108颗佛珠的含义 3 现在有很多人带佛珠,说起来并不是真的信佛,而是看到不少演艺界的明星带,觉得这是一种时尚,所以... 2
3 10000 108颗佛珠的含义 4 佛珠的粒数各有其义:1080粒,是包括了十法界的各108个数;108粒,是表示单纯的108种... 2
4 10000 108颗佛珠的含义 5 1.慈悲是你最好的武器。<br>  2. 学佛就是在学做人而已。<br>  3. 沈默是毁谤... 0

In [10]:
# Summary statistics for the numeric columns (query_id, passage_id, label);
# the rendered output below shows the row count and that labels range over 0..2.
train.describe()


Out[10]:
query_id passage_id label
count 40813.000000 40813.000000 40813.000000
mean 12259.132972 5.006640 1.010732
std 1316.761688 2.653118 0.982964
min 10000.000000 1.000000 0.000000
25% 11113.000000 3.000000 0.000000
50% 12256.000000 5.000000 1.000000
75% 13392.000000 7.000000 2.000000
max 14576.000000 20.000000 2.000000

In [11]:
import jieba

In [13]:
# Tokenise each sample with jieba: the query and its passage text are concatenated
# directly and then segmented with cut_all=False.
# NOTE(review): there is no separator between query and passage, so a token could
# presumably span the boundary — confirm this is intended.
jieba_train = [
    jieba.lcut(query_text + passage_text, cut_all=False)
    for query_text, passage_text in train[['query', 'passage_text']].values
]

In [30]:
# Persist the tokenised samples: one sample per line, tokens joined by commas.
# NOTE(review): the passage text contains ',' characters (see the preview output), so
# if the tokenizer emits ',' as a token it is indistinguishable from the delimiter
# in this file — confirm the downstream reader copes with that.
with open('../data/sogou_ccir/jieba_train.csv', 'w', encoding='utf-8') as f:
    # Loop variable is `tokens`, not `data`, so the raw records loaded earlier in the
    # notebook are not overwritten when this cell runs.
    for tokens in jieba_train:
        # Terminate every row; without the newline all samples ran together on a
        # single line and adjacent rows' edge tokens were fused.
        f.write(",".join(tokens) + "\n")

In [1]:
import word2vec

In [ ]: