Download a Japanese Wikipedia dump file.


In [0]:
# Download a portion of Japanese Wikipedia covering page IDs 1-106175 (roughly 100,000 articles, ~270MB)
!wget https://dumps.wikimedia.org/jawiki/20190620/jawiki-20190620-pages-articles1.xml-p1p106175.bz2
# Note: a dump file containing every Japanese Wikipedia article (~3GB) can be downloaded from this link: https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2

Install wikiextractor and extract the Wikipedia article text (about 40 minutes).


In [0]:
# Install the wikiextractor tool, which extracts article text from the compressed Wikipedia dump file.
!git clone https://github.com/attardi/wikiextractor.git
# The tool can also be installed as a library from pip (!pip install wikiextractor).
# Write the extracted Wikipedia article data to a local folder and rename the output to a more descriptive file name.
!python ./wikiextractor/WikiExtractor.py jawiki-20190620-pages-articles1.xml-p1p106175.bz2 -o japanese_extracted_articles -b 500M --no_templates --filter_disambig_pages
!mv japanese_extracted_articles/AA/wiki_00 japanese_wikipedia_extracted_articles.txt
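
The extracted output wraps each article in <doc id="..." url="..." title="..."> ... </doc> tags. As a quick peek (a minimal sketch, assuming the extraction above finished and produced japanese_wikipedia_extracted_articles.txt), the cell below prints the first few lines so the format parsed later in this notebook is visible.


In [0]:
# Print the first few lines of the extracted file to show the <doc ...> wrapper format
with open("japanese_wikipedia_extracted_articles.txt", "r") as extracted_file:
  for _ in range(5):
    print(extracted_file.readline().rstrip())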

Install the MeCab morphological analyzer and import the required libraries.


In [0]:
# Install the MeCab morphological analyzer
!pip install mecab-python3

In [0]:
# Import the required libraries
import MeCab
from collections import Counter
import codecs
import nltk
import sqlite3
import re
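
Before defining the frequency counter, the short sketch below shows what MeCab's parseToNode interface returns for a sample sentence: each node exposes a surface form and a comma-separated feature string whose first field is the part of speech and whose seventh field is the dictionary (lemma) form. This is only an illustration; the exact labels and the number of feature fields depend on the dictionary installed with mecab-python3.


In [0]:
# Minimal sketch of MeCab's node interface (output depends on the installed dictionary)
sample_tagger = MeCab.Tagger()
sample_node = sample_tagger.parseToNode('日本語の形態素解析を試します。')
while sample_node:
  features = sample_node.feature.split(',')
  # surface form, part of speech, and dictionary form (when the dictionary provides one)
  print(sample_node.surface, features[0], features[6] if len(features) > 6 else '*')
  sample_node = sample_node.next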

Define the function that computes word frequencies for a given text.


In [0]:
# Stopwords (function words and wiki-markup tokens) to exclude from the counts
stopwords = ['する', 'なる', 'ない', 'これ', 'それ', 'id', 'ja', 'wiki',
             'wikipedia', 'doc', 'https', 'org', 'url', 'いう', 'ある',
             'curid', 'あれ', 'それら', 'これら', 'それそれ', 'それぞれ',
             'title', 'その後', '一部', '前', 'よる', '一つ', 'ひとつ', '他',
             'その他', 'ほか', 'そのほか', 'いる']
# Keep nouns, verbs and adjectives; skip non-independent words, suffixes, suru-verb nouns and numerals
word_categories = ['名詞', '動詞', '形容詞']
word_categories_to_avoid = ['非自立', '接尾', 'サ変接続', '数']

# Compute the word frequencies of the given text
def count_word_frequencies(text):
  all_nouns_verbs_adjs = []
  tagger = MeCab.Tagger()
  for line in text:
      node = tagger.parseToNode(line)
      while(node):
        lemma = node.feature.split(',')[6].lower() # dictionary (lemma) form
        pos = node.feature.split(',')[0].lower()  # part of speech
        pos2 = node.feature.split(',')[1].lower() # part-of-speech subcategory
        if lemma != '':
          if lemma == '*' and node.surface != "":
            lemma = node.surface
          if (pos in word_categories and
              pos2 not in word_categories_to_avoid and
              lemma not in stopwords):
            all_nouns_verbs_adjs.append(lemma)
        node = node.next
  return Counter(all_nouns_verbs_adjs)
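
As a quick sanity check of count_word_frequencies (a hedged sketch; the exact lemmas depend on the installed MeCab dictionary), the function can be run on a couple of short sample sentences and the most common words printed.


In [0]:
# Quick check on a tiny sample text; real counts depend on the MeCab dictionary in use
sample_counts = count_word_frequencies(['ウィキペディアは誰でも編集できる百科事典です。',
                                        '百科事典の記事は多くの言語で書かれています。'])
print(sample_counts.most_common(5))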

Create a local database for storing the article text and related data.


In [0]:
# Table for storing the raw text of each Wikipedia article
create_article_text_table_sql = """
drop table if exists article_text;
create table article_text (
    id integer primary key autoincrement not null,
    article_id integer not null,
    title text not null,
    article text not null,
    article_url text not null
);
"""

# Table for storing word frequencies over all of Wikipedia
create_wikipedia_word_frequencies_table_sql = """
drop table if exists wikipedia_word_frequencies;
create table wikipedia_word_frequencies (
    word text primary key not null,
    frequency integer not null
);
"""

# Table for storing word frequencies of each individual Wikipedia article
create_article_word_frequencies_table_sql = """
drop table if exists article_word_frequencies;
create table article_word_frequencies (
    id integer primary key autoincrement not null,
    article_id integer not null,
    word text not null,
    frequency integer not null
);
"""

# Table for storing the TF-IDF score of each word in each Wikipedia article
create_article_word_tfidfs_table_sql = """
drop table if exists article_word_tfidfs;
create table article_word_tfidfs (
    id integer primary key autoincrement not null,
    article_id integer not null,
    word text not null,
    tfidf_score real not null
);
"""

DB_PATH = "japanese_wikipedia_analysis.db"

In [0]:
def initialize_database(db_path):
  db_connection = sqlite3.connect(db_path)
  database_initialization = [
                              create_article_text_table_sql,
                              create_wikipedia_word_frequencies_table_sql,
                              create_article_word_frequencies_table_sql,
                              create_article_word_tfidfs_table_sql
                              ]
  for sql_query in database_initialization:
    db_connection.executescript(sql_query)
  db_connection.commit()
  db_connection.close()

initialize_database(DB_PATH)
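
To confirm the schema was created, the tables can be listed from sqlite_master (a small sanity check, not part of the pipeline itself; the sqlite_sequence bookkeeping table created by AUTOINCREMENT will also appear).


In [0]:
# List the tables in the freshly initialized database
db_connection = sqlite3.connect(DB_PATH)
for row in db_connection.execute("SELECT name FROM sqlite_master WHERE type = 'table'"):
  print(row[0])
db_connection.close()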

Split the full Wikipedia text file into individual articles and store each article's text and word-frequency data.


In [0]:
def save_article(db_path, id, url, title, text):
  db_connection = sqlite3.connect(db_path)
  insert_statement = u"""
  INSERT INTO article_text (article_id, article_url, title, article)
  VALUES (?, ?, ?, ?)"""
  db_connection.executemany(insert_statement, [(id, url, title, text)])
  db_connection.commit()
  db_connection.close()

def calculate_article_word_frequencies(db_path, article_id, text):
  db_connection = sqlite3.connect(db_path)
  insert_statement = u"""
  INSERT INTO article_word_frequencies (article_id, word, frequency)
  VALUES (?, ?, ?)"""
  article_word_frequencies = count_word_frequencies([text]).items()
  db_connection.executemany(insert_statement,
                            [(article_id, pair[0], pair[1]) for pair in article_word_frequencies])
  db_connection.commit()
  db_connection.close()

# Process the text file, splitting it into articles and computing word frequencies for each
def parse_articles(db_path, file):
  article_header = re.compile(r'^<doc id=\"([0-9]+)\" url=\"(.*)\" title=\"(.*)\">$')
  article_footer = re.compile(r'^</doc>$')
  # The text between each <doc>...</doc> pair is one article
  with open(file, 'r') as wikipedia_dump:
      article_text = ''
      article_id = 0
      article_url = ''
      article_title = ''
      for line in wikipedia_dump:
        if not line:
          continue
        header_found = article_header.search(line)
        footer_found = article_footer.search(line)
        if header_found:
          article_id = header_found.group(1)
          article_url = header_found.group(2)
          article_title = header_found.group(3)
          continue
        elif footer_found:
          save_article(db_path, article_id, article_url, article_title, article_text)
          calculate_article_word_frequencies(db_path, article_id, article_text)
          article_text = ''
          article_id = 0
          article_url = ''
          article_title = ''
        else:
          article_text += "\n" + line

parse_articles(DB_PATH, "japanese_wikipedia_extracted_articles.txt")
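
After parsing, a quick query (again only a sanity check) shows how many articles were stored.


In [0]:
# Count the articles that were saved into the database
db_connection = sqlite3.connect(DB_PATH)
for row in db_connection.execute("SELECT COUNT(*) FROM article_text"):
  print("Saved articles:", row[0])
db_connection.close()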

Store the word frequencies computed over all of Wikipedia.


In [0]:
def calculate_wikipedia_word_frequencies(db_path):
  # Open the database
  db_connection = sqlite3.connect(db_path)
  insert_statement = u"""
  INSERT INTO wikipedia_word_frequencies (word, frequency) VALUES (?, ?)"""
  with codecs.open("japanese_wikipedia_extracted_articles.txt", "r",'utf-8') as full_wiki:
    db_connection.executemany(insert_statement,
                              [(pair[0], pair[1]) for pair in count_word_frequencies(full_wiki).items()])
  # Commit the changes and close.
  db_connection.commit()
  db_connection.close()

calculate_wikipedia_word_frequencies(DB_PATH)

Define the functions that compute TF-IDF scores.


In [0]:
from math import log
def tf_idf(word, doc_word_frequencies, corpus_word_frequencies, vocabulary_size):
  return tf(word, doc_word_frequencies) * idf(word, corpus_word_frequencies, vocabulary_size)

def tf(word, doc_word_frequencies):
  return log(1 + doc_word_frequencies[word])

def idf(word, corpus_word_frequencies, vocabulary_size):
  if word not in corpus_word_frequencies or corpus_word_frequencies[word] == 0:
    return 1
  else:
    return  log(vocabulary_size / corpus_word_frequencies[word])
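
As a small worked example with made-up numbers (not real Wikipedia counts): if a word occurs 3 times in an article, 10 times in the whole corpus, and the vocabulary holds 1,000 distinct words, then tf = log(1 + 3) ≈ 1.39, idf = log(1000 / 10) ≈ 4.61, and the TF-IDF score is their product, about 6.4. A word missing from the corpus frequencies gets idf = 1, so its score falls back to its tf value.


In [0]:
# Toy example with invented counts: tf = log(1 + 3), idf = log(1000 / 10)
toy_doc_frequencies = {'記事': 3}
toy_corpus_frequencies = {'記事': 10}
print(tf_idf('記事', toy_doc_frequencies, toy_corpus_frequencies, 1000))  # roughly 6.4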

Compute and store the TF-IDF score of each word in each article (>1 hour).


In [0]:
def retrieve_articles_wordfreqs_by_id(db_path, article_id):
  db_connection = sqlite3.connect(db_path)
  retrieve_statement = u"""
  SELECT word, frequency
  FROM article_word_frequencies
  WHERE article_id = {seq}""".format(seq=str(article_id))
  result = []
  cursor = db_connection.execute(retrieve_statement)
  for row in cursor:
      result.append(row)
  db_connection.close()
  return result

def retrieve_wikipedia_wordfreqs(db_path, words_list):
  db_connection = sqlite3.connect(db_path)
  retrieve_statement = u"""
  SELECT word, frequency
  FROM wikipedia_word_frequencies
  WHERE word IN (\"{seq}\")""".format(seq='","'.join(words_list))
  result = []
  cursor = db_connection.execute(retrieve_statement)
  for row in cursor:
    result.append(row)
  db_connection.close()
  return result

# Load the Wikipedia-wide word frequencies into a dict (word -> frequency) for the IDF computation
def retrieve_all_wikipedia_wordfreqs(db_path):
  db_connection = sqlite3.connect(db_path)
  cursor = db_connection.execute("SELECT word, frequency FROM wikipedia_word_frequencies")
  result = dict(cursor.fetchall())
  db_connection.close()
  return result

wikipedia_frequencies = retrieve_all_wikipedia_wordfreqs(DB_PATH)

def retrieve_wikipedia_vocabulary_size(db_path):
  db_connection = sqlite3.connect(db_path)
  retrieve_statement = u"""
  SELECT COUNT(DISTINCT(word))
  FROM wikipedia_word_frequencies"""
  result = 0
  cursor = db_connection.execute(retrieve_statement)
  for row in cursor:
    result = row[0]
  db_connection.close()
  return result

wikipedia_vocabulary_size = retrieve_wikipedia_vocabulary_size(DB_PATH)

def save_article_tfidfs(db_path, article_id):
  insert_statement = u"""
  INSERT INTO article_word_tfidfs (article_id, word, tfidf_score)
  VALUES (?, ?, ?)"""
  article_word_frequencies = dict(retrieve_articles_wordfreqs_by_id(db_path, article_id))
  article_word_tfidfs_tuples = [(article_id, word, tf_idf(word, article_word_frequencies, wikipedia_frequencies, wikipedia_vocabulary_size)) for word in article_word_frequencies.keys()]
  db_connection = sqlite3.connect(db_path)
  db_connection.executemany(insert_statement, article_word_tfidfs_tuples)
  db_connection.commit()
  db_connection.close()

def calculate_articles_tfidfs(db_path):
  db_connection = sqlite3.connect(db_path)
  retrieve_statement = u"""
  SELECT DISTINCT(article_id) FROM article_word_frequencies"""
  cursor = db_connection.execute(retrieve_statement)
  articles = []
  for row in cursor:
      articles.append(row[0])
  db_connection.close()
  for article_id in articles:
    save_article_tfidfs(db_path, article_id)

calculate_articles_tfidfs(DB_PATH)
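
A quick look at a few of the highest-scoring stored rows (only a sanity check) confirms the table was populated.


In [0]:
# Peek at a few (article_id, word, tfidf_score) rows with the highest scores
db_connection = sqlite3.connect(DB_PATH)
for row in db_connection.execute(
    "SELECT article_id, word, tfidf_score FROM article_word_tfidfs ORDER BY tfidf_score DESC LIMIT 5"):
  print(row)
db_connection.close()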

Define helper functions.


In [0]:
# Load the word-frequency data of the article with the given article_id from the database
def retrieve_articles_wordfreqs_by_id(db_path, article_id):
  db_connection = sqlite3.connect(db_path)
  retrieve_statement = u"""
  SELECT word, frequency
  FROM article_word_frequencies
  WHERE article_id = {seq}""".format(seq=str(article_id))
  result = []
  cursor = db_connection.execute(retrieve_statement)
  for row in cursor:
      result.append(row)
  db_connection.close()
  return result

# Load amount_articles randomly chosen articles from the database
def retrieve_random_articles(db_path, amount_articles):
  db_connection = sqlite3.connect(db_path)
  retrieve_statement = u"""
  SELECT article_id, title, article
  FROM article_text
  ORDER BY RANDOM() LIMIT """ + str(amount_articles)
  result = []
  cursor = db_connection.execute(retrieve_statement)
  for row in cursor:
      result.append(row)
  return result

# Load the articles with the given IDs from the database
def retrieve_articles_by_ids(db_path, article_ids_list):
  db_connection = sqlite3.connect(db_path)
  retrieve_statement = u"""
  SELECT article_id, title, article
  FROM article_text
  WHERE article_id IN ({seq})""".format(seq=','.join(article_ids_list))
  result = []
  cursor = db_connection.execute(retrieve_statement)
  for row in cursor:
      result.append(row)
  return result

# Load the articles with the titles in article_titles_list from the database
def retrieve_articles_by_titles(db_path, article_titles_list):
  db_connection = sqlite3.connect(db_path)
  retrieve_statement = u"""
  SELECT article_id, title, article
  FROM article_text
  WHERE title IN (\"{seq}\")""".format(seq='","'.join(article_titles_list))
  result = []
  cursor = db_connection.execute(retrieve_statement)
  for row in cursor:
      result.append(row)
  return result

# Load each word of the article with the given title, together with its TF-IDF score, from the database
def retrieve_articles_words_tfidfs_by_title(db_path, article_title):
  db_connection = sqlite3.connect(db_path)
  retrieve_statement = u"""
  SELECT
    word,
    tfidf_score
  FROM article_word_tfidfs
  INNER JOIN article_text
   ON article_text.article_id = article_word_tfidfs.article_id
  WHERE title = \"{seq}\"
  ORDER BY tfidf_score DESC; 
  """.format(seq=article_title)
  result = []
  cursor = db_connection.execute(retrieve_statement)
  for row in cursor:
      result.append(row)
  db_connection.close()
  return result
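
A hedged usage sketch of the helpers above: pick one random article and list its ten highest-TF-IDF words by title. Because the helpers build their SQL by string formatting, this assumes the chosen title contains no double quotes.


In [0]:
# Usage sketch: top TF-IDF words of one randomly chosen article
random_article = retrieve_random_articles(DB_PATH, 1)[0]
print(random_article[1])  # the article title
print(retrieve_articles_words_tfidfs_by_title(DB_PATH, random_article[1])[:10])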

In [0]:
# Article IDs of 600 randomly selected articles
six_hundred_random_articles = [
    '15440', '48605', '97947', '93611', '9345', '15240', '44495', '73378', '29438', '105238', '30927', '53640', '24155', '16388', '62926', '99988',
    '105031', '63578', '15975', '105200', '10959', '2916', '25306', '100171', '64296', '87272', '655', '39045', '9882', '99104', '11836', '103448', 
    '100706', '46769', '5698', '91613', '34683', '2009', '98916', '82199', '96534', '42074', '46525', '86848', '4376', '87836', '61109', '23894', 
    '46551', '8580', '85456', '63773', '56844', '28672', '76188', '51948', '35791', '94852', '33394', '19173', '44734', '11243', '104952', '98372', 
    '39161', '97470', '105888', '43787', '79526', '92471', '71389', '76790', '10113', '98822', '29032', '31035', '71037', '70350', '62673', '79612', 
    '69329', '98759', '29391', '46890', '5270', '4015', '14061', '91990', '39171', '38310', '17703', '26351', '73463', '32801', '85657', '36473', 
    '56036', '59475', '80541', '75385', '43304', '75902', '65163', '2160', '34027', '101328', '99787', '77979', '33838', '37300', '71870', '28833', 
    '101072', '60008', '10817', '38461', '56193', '99743', '54179', '68782', '102308', '99242', '58054', '76002', '99845', '11579', '22268', '28195', 
    '73700', '24341', '52919', '47208', '23030', '6032', '3259', '34742', '85950', '52057', '87398', '87515', '17596', '104078', '8765', '69760', 
    '28743', '102245', '24170', '27917', '38795', '67501', '80972', '81837', '51431', '28953', '11541', '28066', '67014', '72834', '62063', '55171', 
    '42553', '72389', '104465', '996', '27759', '18708', '788', '71057', '43', '9946', '6405', '32749', '93255', '41615', '75802', '23958', '80370', 
    '22475', '56061', '98034', '79627', '40664', '103406', '18015', '79357', '96109', '51472', '1407', '40450', '19255', '42494', '51933', '58464', 
    '62683', '42788', '53284', '15769', '57347', '78889', '104672', '41921', '96299', '29146', '58826', '60446', '57672', '26751', '47341', '89190', 
    '59086', '8458', '83688', '15250', '57614', '63120', '88327', '105227', '63947', '56114', '86277', '97687', '67566', '53527', '94202', '30510', 
    '29298', '1141', '68031', '101086', '32043', '61914', '46464', '21415', '5580', '59604', '59779', '20689', '60200', '24634', '22223', '59525', 
    '102003', '54280', '16410', '55488', '11316', '72981', '45245', '24471', '33880', '69195', '46738', '92207', '75672', '105012', '71034', '86891', 
    '105846', '53905', '2819', '57681', '56451', '97783', '79576', '63061', '58991', '102999', '8385', '90767', '65215', '80039', '8165', '9255', 
    '57294', '24463', '23993', '50346', '26214', '34620', '66393', '80143', '79695', '86538', '40795', '5486', '45192', '2364', '74829', '17724', 
    '14849', '82345', '90376', '100555', '59575', '75381', '6423', '51596', '92150', '1008', '45999', '6027', '76978', '59333', '25758', '63831', 
    '61470', '4292', '805', '100886', '30471', '37969', '90659', '27857', '3762', '37457', '75108', '72829', '1251', '66628', '7373', '4979', 
    '17030', '40239', '38354', '13813', '2264', '93274', '26003', '90258', '66521', '12135', '65007', '59893', '77958', '16544', '63864', '24669',
    '92463', '67671', '9046', '33033', '1221', '100188', '8255', '4639', '41076', '48870', '17395', '12516', '20503', '54274', '98195', '87347', 
    '101120', '13649', '77670', '100414', '1929', '105199', '53789', '57956', '7079', '46059', '20132', '21751', '39519', '91745', '54276', '9867', 
    '15878', '51559', '37235', '63144', '103037', '28642', '34667', '14090', '67137', '35430', '81894', '29789', '64427', '47238', '8757', '25046', 
    '71370', '1872', '36144', '869', '70451', '78354', '56752', '92323', '104375', '82298', '72040', '40294', '55279', '22682', '33613', '2433',
    '57654', '10278', '76223', '26610', '38805', '90158', '10845', '4586', '105417', '94988', '83845', '79097', '50223', '76484', '24613', '90746', 
    '1834', '89419', '13679', '33452', '71476', '8535', '93329', '80573', '100066', '795', '46053', '65721', '54796', '51411', '75101', '85756', 
    '100863', '55421', '59800', '2706', '49940', '10687', '33194', '38376', '32910', '36938', '99280', '24176', '6108', '1530', '61890', '29106', 
    '30107', '85588', '78859', '82961', '44806', '83704', '33233', '81674', '88561', '33346', '22383', '12974', '13149', '82394', '47593', '7086', 
    '70752', '79314', '71824', '27348', '56837', '483', '14592', '11369', '100281', '51893', '66472', '3130', '100259', '83466', '67251', '786', 
    '29289', '77015', '103124', '67900', '105221', '34287', '83598', '55234', '1969', '58163', '55083', '41483', '4952', '42207', '12827', '34554', 
    '33742', '39553', '56041', '71923', '49543', '59083', '16484', '30947', '34219', '6124', '5067', '4783', '18112', '16137', '50516', '94644', 
    '26756', '20712', '38371', '44809', '3898', '35419', '37239', '13913', '65177', '16907', '22725', '32854', '97439', '7823', '90311', '20801', 
    '68840', '20145', '28710', '33826', '50104', '13302', '48102', '72616', '64795', '98879', '102759', '45726', '68458', '63728', '1577', '5372', 
    '35087', '14509', '88670', '30344', '84740', '15095', '57071', '39983', '41248', '31955', '4637', '104157', '104410', '11229', '24752', '72480', 
    '57253', '26239', '57305', '47046', '6942', '74589', '19206', '102740', '14086', '105098', '29852', '23707', '77355', '105856', '63242', '45972', 
    '19158', '68190', '89030', '66555', '71440', '57680', '73001', '27485', '54610', '85610', '7821', '12159', '79770', '76601', '39708', '33783', 
    '81392', '97379', '48891', '39678', '88120', '63076', '49849']