In [0]:
# Download a portion of the Japanese Wikipedia dump (roughly 100,000 articles, ~270MB)
!wget https://dumps.wikimedia.org/jawiki/20190620/jawiki-20190620-pages-articles1.xml-p1p106175.bz2
# Note: a dump file containing every Japanese Wikipedia article (~3GB) can be downloaded here: https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2
In [0]:
# Install the wikiextractor tool, which extracts article text from the compressed Wikipedia dump file.
!git clone https://github.com/attardi/wikiextractor.git
# This tool can also be installed as a library from pip (!pip install wikiextractor).
# Write the extracted Wikipedia article data to a local folder, then rename it to a more descriptive file name.
!python ./wikiextractor/WikiExtractor.py jawiki-20190620-pages-articles1.xml-p1p106175.bz2 -o japanese_extracted_articles -b 500M --no_templates --filter_disambig_pages
!mv japanese_extracted_articles/AA/wiki_00 japanese_wikipedia_extracted_articles.txt
In [0]:
# Install the MeCab morphological analysis tool
!pip install mecab-python3
In [0]:
# Import the required libraries
import MeCab
from collections import Counter
import codecs
import nltk
import sqlite3
import re
In [0]:
stopwords = ['する', 'なる', 'ない', 'これ', 'それ', 'id', 'ja', 'wiki',
'wikipedia', 'id', 'doc', 'https', 'org', 'url', 'いう', 'ある',
'curid', 'あれ', 'それら', 'これら', 'それそれ', 'それぞれ',
'title', 'その後', '一部', '前', 'よる', '一つ', 'ひとつ', '他',
'その他', 'ほか', 'そのほか', 'いる']
word_categories = ['名詞', '動詞', '形容詞']
word_categories_to_avoid = ['非自立', '接尾', 'サ変接続', '数']
# Compute word-occurrence frequencies for the given text
def count_word_frequencies(text):
    all_nouns_verbs_adjs = []
    tagger = MeCab.Tagger()
    for line in text:
        node = tagger.parseToNode(line)
        while node:
            features = node.feature.split(',')
            lemma = features[6].lower()  # dictionary (base) form
            pos = features[0].lower()  # part of speech
            pos2 = features[1].lower()  # part-of-speech subcategory
            if lemma != '':
                # Fall back to the surface form when no dictionary form is available
                if lemma == '*' and node.surface != "":
                    lemma = node.surface
                if (pos in word_categories and
                        pos2 not in word_categories_to_avoid and
                        lemma not in stopwords):
                    all_nouns_verbs_adjs.append(lemma)
            node = node.next
    return Counter(all_nouns_verbs_adjs)
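In [0]:
# A quick check of count_word_frequencies on a short sample sentence (an addition for
# illustration, not part of the original pipeline). The exact lemmas returned depend on
# which MeCab dictionary is installed; the IPAdic field layout is assumed, since the
# function above reads the dictionary form from feature field 6.
sample_text = ['吾輩は猫である。名前はまだ無い。']
print(count_word_frequencies(sample_text).most_common(5))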
In [0]:
# Table for storing the raw text of each Wikipedia article
create_article_text_table_sql = """
drop table if exists article_text;
create table article_text (
id integer primary key autoincrement not null,
article_id integer not null,
title text not null,
article text not null,
article_url text not null
);
"""
# Table for storing word frequencies across the whole of Wikipedia
create_wikipedia_word_frequencies_table_sql = """
drop table if exists wikipedia_word_frequencies;
create table wikipedia_word_frequencies (
word text primary key not null,
frequency integer not null
);
"""
# Table for storing per-article word frequencies
create_article_word_frequencies_table_sql = """
drop table if exists article_word_frequencies;
create table article_word_frequencies (
id integer primary key autoincrement not null,
article_id integer not null,
word text not null,
frequency integer not null
);
"""
# Table for storing the tfidf scores of each Wikipedia article
create_article_word_tfidfs_table_sql = """
drop table if exists article_word_tfidfs;
create table article_word_tfidfs (
id integer primary key autoincrement not null,
article_id integer not null,
word text not null,
tfidf_score real not null
);
"""
DB_PATH = "japanese_wikipedia_analysis.db"
In [0]:
def initialize_database(db_path):
db_connection = sqlite3.connect(db_path)
database_initialization = [
create_article_text_table_sql,
create_wikipedia_word_frequencies_table_sql,
create_article_word_frequencies_table_sql,
create_article_word_tfidfs_table_sql
]
for sql_query in database_initialization:
db_connection.executescript(sql_query)
db_connection.commit()
db_connection.close()
initialize_database(DB_PATH)
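In [0]:
# Optional sanity check (an addition): list the tables that initialize_database
# just created by querying sqlite_master.
db_connection = sqlite3.connect(DB_PATH)
print(db_connection.execute(
    "SELECT name FROM sqlite_master WHERE type = 'table'").fetchall())
db_connection.close()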
In [0]:
def save_article(db_path, id, url, title, text):
db_connection = sqlite3.connect(db_path)
insert_statement = u"""
INSERT INTO article_text (article_id, article_url, title, article)
VALUES (?, ?, ?, ?)"""
db_connection.executemany(insert_statement, [(id, url, title, text)])
db_connection.commit()
db_connection.close()
def calculate_article_word_frequencies(db_path, article_id, text):
db_connection = sqlite3.connect(db_path)
insert_statement = u"""
INSERT INTO article_word_frequencies (article_id, word, frequency)
VALUES (?, ?, ?)"""
article_word_frequencies = count_word_frequencies([text]).items()
db_connection.executemany(insert_statement,
[(article_id, pair[0], pair[1]) for pair in article_word_frequencies])
db_connection.commit()
db_connection.close()
# Process the text file, splitting it into individual articles and computing word frequencies for each
def parse_articles(db_path, file):
article_header = re.compile(r'^<doc id=\"([0-9]+)\" url=\"(.*)\" title=\"(.*)\">$')
article_footer = re.compile(r'^</doc>$')
# The text between each <doc>...</doc> pair is one article
with open(file, 'r') as wikipedia_dump:
article_text = ''
article_id = 0
article_url = ''
article_title = ''
for line in wikipedia_dump:
if not line:
continue
header_found = article_header.search(line)
footer_found = article_footer.search(line)
if header_found:
article_id = header_found.group(1)
article_url = header_found.group(2)
article_title = header_found.group(3)
continue
elif footer_found:
save_article(db_path, article_id, article_url, article_title, article_text)
calculate_article_word_frequencies(db_path, article_id, article_text)
article_text = ''
article_id = 0
article_url = ''
article_title = ''
else:
article_text += "\n" + line
parse_articles(DB_PATH, "japanese_wikipedia_extracted_articles.txt")
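In [0]:
# For reference, parse_articles expects the wikiextractor <doc> format: the header line
# of each article looks like the made-up example below, and the regex above captures
# its id, url, and title attributes.
example_header = '<doc id="12" url="https://ja.wikipedia.org/wiki?curid=12" title="地理">'
header_pattern = re.compile(r'^<doc id=\"([0-9]+)\" url=\"(.*)\" title=\"(.*)\">$')
print(header_pattern.search(example_header).groups())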
In [0]:
def calculate_wikipedia_word_frequencies(db_path):
# Open the database
db_connection = sqlite3.connect(db_path)
insert_statement = u"""
INSERT INTO wikipedia_word_frequencies (word, frequency) VALUES (?, ?)"""
with codecs.open("japanese_wikipedia_extracted_articles.txt", "r",'utf-8') as full_wiki:
db_connection.executemany(insert_statement,
[(pair[0], pair[1]) for pair in count_word_frequencies(full_wiki).items()])
# Commit the changes and close.
db_connection.commit()
db_connection.close()
calculate_wikipedia_word_frequencies(DB_PATH)
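In [0]:
# Optional check (an addition): look at the most frequent words across the processed
# dump, read back from the wikipedia_word_frequencies table.
db_connection = sqlite3.connect(DB_PATH)
print(db_connection.execute(
    "SELECT word, frequency FROM wikipedia_word_frequencies ORDER BY frequency DESC LIMIT 10").fetchall())
db_connection.close()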
In [0]:
from math import log
def tf_idf(word, doc_word_frequencies, corpus_word_frequencies, vocabulary_size):
return tf(word, doc_word_frequencies) * idf(word, corpus_word_frequencies, vocabulary_size)
def tf(word, doc_word_frequencies):
return log(1 + doc_word_frequencies[word])
def idf(word, corpus_word_frequencies, vocabulary_size):
if word not in corpus_word_frequencies or corpus_word_frequencies[word] == 0:
return 1
else:
return log(vocabulary_size / corpus_word_frequencies[word])
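In [0]:
# A small worked example of the scoring above, with made-up counts: a word that appears
# 3 times in one article and 100 times in the whole corpus, given a hypothetical
# vocabulary of 50,000 distinct words.
doc_freqs = {'猫': 3}
corpus_freqs = {'猫': 100}
vocab_size = 50000
print(tf('猫', doc_freqs))                         # log(1 + 3) ≈ 1.386
print(idf('猫', corpus_freqs, vocab_size))         # log(50000 / 100) ≈ 6.215
print(tf_idf('猫', doc_freqs, corpus_freqs, vocab_size))  # product of the two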
In [0]:
def retrieve_articles_wordfreqs_by_id(db_path, article_id):
db_connection = sqlite3.connect(db_path)
retrieve_statement = u"""
SELECT word, frequency
FROM article_word_frequencies
WHERE article_id = {seq}""".format(seq=str(article_id))
result = []
cursor = db_connection.execute(retrieve_statement)
for row in cursor:
result.append(row)
db_connection.close()
return result
def retrieve_wikipedia_wordfreqs(db_path, words_list):
db_connection = sqlite3.connect(db_path)
retrieve_statement = u"""
SELECT word, frequency
FROM wikipedia_word_frequencies
WHERE word IN (\"{seq}\")""".format(seq='","'.join(words_list))
result = []
cursor = db_connection.execute(retrieve_statement)
for row in cursor:
result.append(row)
db_connection.close()
return result
# Load the corpus-wide word frequencies computed earlier into a dict for the idf calculation
db_connection = sqlite3.connect(DB_PATH)
wikipedia_frequencies = dict(db_connection.execute(
    "SELECT word, frequency FROM wikipedia_word_frequencies").fetchall())
db_connection.close()
def retrieve_wikipedia_vocabulary_size(db_path):
db_connection = sqlite3.connect(db_path)
retrieve_statement = u"""
SELECT COUNT(DISTINCT(word))
FROM wikipedia_word_frequencies"""
result = 0
cursor = db_connection.execute(retrieve_statement)
for row in cursor:
result = row[0]
db_connection.close()
return result
wikipedia_vocabulary_size = retrieve_wikipedia_vocabulary_size(DB_PATH)
def save_article_tfidfs(db_path, article_id):
insert_statement = u"""
INSERT INTO article_word_tfidfs (article_id, word, tfidf_score)
VALUES (?, ?, ?)"""
article_word_frequencies = dict(retrieve_articles_wordfreqs_by_id(db_path, article_id))
article_word_tfidfs_tuples = [(article_id, word, tf_idf(word, article_word_frequencies, wikipedia_frequencies, wikipedia_vocabulary_size)) for word in article_word_frequencies.keys()]
db_connection = sqlite3.connect(db_path)
db_connection.executemany(insert_statement, article_word_tfidfs_tuples)
db_connection.commit()
db_connection.close()
def calculate_articles_tfidfs(db_path):
db_connection = sqlite3.connect(db_path)
retrieve_statement = u"""
SELECT DISTINCT(article_id) FROM article_word_frequencies"""
cursor = db_connection.execute(retrieve_statement)
articles = []
for row in cursor:
articles.append(row[0])
db_connection.close()
for article_id in articles:
save_article_tfidfs(db_path, article_id)
calculate_articles_tfidfs(DB_PATH)
In [0]:
# Load the word frequencies of the article with the given article_id from the database
def retrieve_articles_wordfreqs_by_id(db_path, article_id):
db_connection = sqlite3.connect(db_path)
retrieve_statement = u"""
SELECT word, frequency
FROM article_word_frequencies
WHERE article_id = {seq}""".format(seq=str(article_id))
result = []
cursor = db_connection.execute(retrieve_statement)
for row in cursor:
result.append(row)
db_connection.close()
return result
# Load amount_articles randomly chosen articles from the database
def retrieve_random_articles(db_path, amount_articles):
db_connection = sqlite3.connect(db_path)
retrieve_statement = u"""
SELECT article_id, title, article
FROM article_text
ORDER BY RANDOM() LIMIT """ + str(amount_articles)
result = []
cursor = db_connection.execute(retrieve_statement)
for row in cursor:
result.append(row)
    db_connection.close()
    return result
# Load the articles with the given IDs from the database
def retrieve_articles_by_ids(db_path, article_ids_list):
db_connection = sqlite3.connect(db_path)
retrieve_statement = u"""
SELECT article_id, title, article
FROM article_text
WHERE article_id IN ({seq})""".format(seq=','.join(article_ids_list))
result = []
cursor = db_connection.execute(retrieve_statement)
for row in cursor:
result.append(row)
    db_connection.close()
    return result
# Load the articles with titles in article_titles_list from the database
def retrieve_articles_by_titles(db_path, article_titles_list):
db_connection = sqlite3.connect(db_path)
retrieve_statement = u"""
SELECT article_id, title, article
FROM article_text
WHERE title IN (\"{seq}\")""".format(seq='","'.join(article_titles_list))
result = []
cursor = db_connection.execute(retrieve_statement)
for row in cursor:
result.append(row)
    db_connection.close()
    return result
# Load each word of the article with the given title, along with its tfidf score, from the database
def retrieve_articles_words_tfidfs_by_title(db_path, article_title):
db_connection = sqlite3.connect(db_path)
retrieve_statement = u"""
SELECT
word,
tfidf_score
FROM article_word_tfidfs
INNER JOIN article_text
ON article_text.article_id = article_word_tfidfs.article_id
WHERE title = \"{seq}\"
ORDER BY tfidf_score DESC;
""".format(seq=article_title)
result = []
cursor = db_connection.execute(retrieve_statement)
for row in cursor:
result.append(row)
    db_connection.close()
    return result
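In [0]:
# Example usage (an addition, with a hypothetical title): the ten highest-tfidf words
# for one article. Any title present in the article_text table works here.
for word, score in retrieve_articles_words_tfidfs_by_title(DB_PATH, "地理")[:10]:
    print(word, score)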
In [0]:
# Article IDs of 600 randomly selected articles
six_hundred_random_articles = [
'15440', '48605', '97947', '93611', '9345', '15240', '44495', '73378', '29438', '105238', '30927', '53640', '24155', '16388', '62926', '99988',
'105031', '63578', '15975', '105200', '10959', '2916', '25306', '100171', '64296', '87272', '655', '39045', '9882', '99104', '11836', '103448',
'100706', '46769', '5698', '91613', '34683', '2009', '98916', '82199', '96534', '42074', '46525', '86848', '4376', '87836', '61109', '23894',
'46551', '8580', '85456', '63773', '56844', '28672', '76188', '51948', '35791', '94852', '33394', '19173', '44734', '11243', '104952', '98372',
'39161', '97470', '105888', '43787', '79526', '92471', '71389', '76790', '10113', '98822', '29032', '31035', '71037', '70350', '62673', '79612',
'69329', '98759', '29391', '46890', '5270', '4015', '14061', '91990', '39171', '38310', '17703', '26351', '73463', '32801', '85657', '36473',
'56036', '59475', '80541', '75385', '43304', '75902', '65163', '2160', '34027', '101328', '99787', '77979', '33838', '37300', '71870', '28833',
'101072', '60008', '10817', '38461', '56193', '99743', '54179', '68782', '102308', '99242', '58054', '76002', '99845', '11579', '22268', '28195',
'73700', '24341', '52919', '47208', '23030', '6032', '3259', '34742', '85950', '52057', '87398', '87515', '17596', '104078', '8765', '69760',
'28743', '102245', '24170', '27917', '38795', '67501', '80972', '81837', '51431', '28953', '11541', '28066', '67014', '72834', '62063', '55171',
'42553', '72389', '104465', '996', '27759', '18708', '788', '71057', '43', '9946', '6405', '32749', '93255', '41615', '75802', '23958', '80370',
'22475', '56061', '98034', '79627', '40664', '103406', '18015', '79357', '96109', '51472', '1407', '40450', '19255', '42494', '51933', '58464',
'62683', '42788', '53284', '15769', '57347', '78889', '104672', '41921', '96299', '29146', '58826', '60446', '57672', '26751', '47341', '89190',
'59086', '8458', '83688', '15250', '57614', '63120', '88327', '105227', '63947', '56114', '86277', '97687', '67566', '53527', '94202', '30510',
'29298', '1141', '68031', '101086', '32043', '61914', '46464', '21415', '5580', '59604', '59779', '20689', '60200', '24634', '22223', '59525',
'102003', '54280', '16410', '55488', '11316', '72981', '45245', '24471', '33880', '69195', '46738', '92207', '75672', '105012', '71034', '86891',
'105846', '53905', '2819', '57681', '56451', '97783', '79576', '63061', '58991', '102999', '8385', '90767', '65215', '80039', '8165', '9255',
'57294', '24463', '23993', '50346', '26214', '34620', '66393', '80143', '79695', '86538', '40795', '5486', '45192', '2364', '74829', '17724',
'14849', '82345', '90376', '100555', '59575', '75381', '6423', '51596', '92150', '1008', '45999', '6027', '76978', '59333', '25758', '63831',
'61470', '4292', '805', '100886', '30471', '37969', '90659', '27857', '3762', '37457', '75108', '72829', '1251', '66628', '7373', '4979',
'17030', '40239', '38354', '13813', '2264', '93274', '26003', '90258', '66521', '12135', '65007', '59893', '77958', '16544', '63864', '24669',
'92463', '67671', '9046', '33033', '1221', '100188', '8255', '4639', '41076', '48870', '17395', '12516', '20503', '54274', '98195', '87347',
'101120', '13649', '77670', '100414', '1929', '105199', '53789', '57956', '7079', '46059', '20132', '21751', '39519', '91745', '54276', '9867',
'15878', '51559', '37235', '63144', '103037', '28642', '34667', '14090', '67137', '35430', '81894', '29789', '64427', '47238', '8757', '25046',
'71370', '1872', '36144', '869', '70451', '78354', '56752', '92323', '104375', '82298', '72040', '40294', '55279', '22682', '33613', '2433',
'57654', '10278', '76223', '26610', '38805', '90158', '10845', '4586', '105417', '94988', '83845', '79097', '50223', '76484', '24613', '90746',
'1834', '89419', '13679', '33452', '71476', '8535', '93329', '80573', '100066', '795', '46053', '65721', '54796', '51411', '75101', '85756',
'100863', '55421', '59800', '2706', '49940', '10687', '33194', '38376', '32910', '36938', '99280', '24176', '6108', '1530', '61890', '29106',
'30107', '85588', '78859', '82961', '44806', '83704', '33233', '81674', '88561', '33346', '22383', '12974', '13149', '82394', '47593', '7086',
'70752', '79314', '71824', '27348', '56837', '483', '14592', '11369', '100281', '51893', '66472', '3130', '100259', '83466', '67251', '786',
'29289', '77015', '103124', '67900', '105221', '34287', '83598', '55234', '1969', '58163', '55083', '41483', '4952', '42207', '12827', '34554',
'33742', '39553', '56041', '71923', '49543', '59083', '16484', '30947', '34219', '6124', '5067', '4783', '18112', '16137', '50516', '94644',
'26756', '20712', '38371', '44809', '3898', '35419', '37239', '13913', '65177', '16907', '22725', '32854', '97439', '7823', '90311', '20801',
'68840', '20145', '28710', '33826', '50104', '13302', '48102', '72616', '64795', '98879', '102759', '45726', '68458', '63728', '1577', '5372',
'35087', '14509', '88670', '30344', '84740', '15095', '57071', '39983', '41248', '31955', '4637', '104157', '104410', '11229', '24752', '72480',
'57253', '26239', '57305', '47046', '6942', '74589', '19206', '102740', '14086', '105098', '29852', '23707', '77355', '105856', '63242', '45972',
'19158', '68190', '89030', '66555', '71440', '57680', '73001', '27485', '54610', '85610', '7821', '12159', '79770', '76601', '39708', '33783',
'81392', '97379', '48891', '39678', '88120', '63076', '49849']
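In [0]:
# Example usage (an addition): fetch a few of these random articles by ID.
# retrieve_articles_by_ids joins the ID strings directly into the SQL IN clause.
for article_id, title, article in retrieve_articles_by_ids(DB_PATH, six_hundred_random_articles[:5]):
    print(article_id, title)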