In [5]:
# coding: utf-8
import pandas as pd
In [6]:
# Load kobis.csv into a DataFrame and preview its first three rows.
df = pd.read_csv('kobis.csv')
df.head(3)
Out[6]:
In [7]:
# Show the first three titles, a blank separator line, then the first record.
print(df['title'][:3])
print('')
# .loc replaces the removed .ix indexer. On an integer index .ix was
# label-based, so .loc[0] selects exactly the same row.
print(df.loc[0])
In [8]:
# Index the rows by the "id" column and keep only the four text columns.
# set_index moves the existing id column into the index. Passing df.id to
# reindex(index=...) instead looks those id values up as labels in the old
# index, which pairs ids with the wrong rows (or yields all-NaN rows)
# whenever id differs from the row's original label.
df2 = df.set_index('id').reindex(columns=['genre', 'title', 'actor', 'director'])
df2[:3]
Out[8]:
In [9]:
# Preview the first three rows that have no missing value in any column.
df2.dropna(how='any').head(3)
Out[9]:
In [10]:
# Group the rows by director, show the groupby object itself, then list the
# titles that belong to one director's group.
gb = df2.groupby('director')
print(gb)
print(gb.get_group('박찬욱')['title'])
In [11]:
# Same selection via a boolean mask: show the first three mask values,
# a blank separator line, then the titles of the matching rows.
print((df2['director'] == '박찬욱')[:3])
print('')
print(df2[df2['director'] == '박찬욱']['title'])
In [12]:
# Preview the first five rows of df2.
df2.head(5)
Out[12]:
In [13]:
# Keep the four text columns, drop rows with any missing value, and order
# the result by director.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
df3 = (
    df.reindex(columns=['genre', 'title', 'director', 'actor'])
    .dropna(how='any')
    .sort_values('director')
)
df3[:5]
Out[13]:
In [14]:
# For the first five rows print the title, the directors and the actors,
# with comma-separated names turned into space-separated ones.
for i in df3[:5].index:
    # .loc replaces the removed .ix indexer; i is an index label here.
    data = df3.loc[i]
    directors = ' '.join(data.director.replace(' ', '').split(','))
    actors = ' '.join(data.actor.split(','))
    # A single formatted string prints the same on Python 2 and Python 3.
    print('%s %s %s' % (data.title, directors, actors))
In [15]:
from gensim.models import word2vec
import logging
# Log at INFO level with a timestamp and level name in front of each message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
In [16]:
# Write one line per (row, actor) pair to sentences.txt in the form
# "<genre words> <title> <director> <actor>".
# Commas in the genre become spaces; spaces inside title, director and actor
# are removed so each of them is written as a single token.
with open('sentences.txt', 'w') as fp:
    for row in df3.values:
        genre, title, director = (
            row[0].replace(',', ' '),
            row[1].replace(' ', ''),
            row[2].replace(' ', ''),
        )
        lines = [
            ' '.join((genre, title, director, actor.replace(' ', ''))) + '\n'
            for actor in row[3].split(',')
        ]
        fp.writelines(lines)
In [17]:
# sentences.txt holds one sentence per line, so read it with LineSentence.
# Text8Corpus ignores line breaks and re-chunks the whole file into
# fixed-length word windows, which merges unrelated lines into one training
# sentence.
sentences = word2vec.LineSentence('sentences.txt')
In [18]:
# Train the word2vec model on the sentence stream with the settings below.
model = word2vec.Word2Vec(
    sentences,
    size=100,
    window=5,
    min_count=5,
    workers=4,
)
In [19]:
# Analogy query: add the two positive tokens, subtract the negative one,
# and keep only the single closest word.
result = model.most_similar(
    positive=[u'올드보이', u'박찬욱'],
    negative=[u'스릴러'],
    topn=1,
)
In [20]:
# Each hit is indexed as a pair; print only its first element.
for word, similarity in result:
    print(word)
In [31]:
# Second analogy query with a different positive/negative token set;
# again only the single closest word is requested.
result = model.most_similar(
    positive=[u'다크나이트라이즈', u'크리스토퍼놀란'],
    negative=[u'히스레저'],
    topn=1,
)
In [32]:
# Print the first element of every hit returned by the query.
for word, similarity in result:
    print(word)