In [1]:
import os, re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame

def clean(instring):
    # Strip any ~~~START|...|START~~~ markers embedded in the text
    instring = re.sub(r'~~~START\|.+?\|START~~~', "", instring)
    
    # Remove ASCII letters and digits so only non-Latin text remains
    instring = re.sub(r'[a-zA-Z0-9]', "", instring)
    
    # Decide on unwanted characters
    unwanted_chars = ['』','。', '!', ',', ':', '、', '(',
                      ')', ';', '?', '〉', '〈', '」', '「',
                      '『', '“', '”', '!', '"', '#', '$', '%',
                      '&', "'", '(', ')', '*', '+', ',', '-',
                      '.', '/', "《", "》", "·"]
    
    for char in unwanted_chars:
        instring = instring.replace(char, "")
    
    return instring
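
# For instance (hypothetical input), clean("~~~START|第一回|START~~~abc你好!")
# returns "你好": the marker, the ASCII characters, and the punctuation
# are all stripped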

def textBreak(inputstring):
    # Decide how long each section should be
    divlim = 10000
    
    loops = len(inputstring)//divlim
    
    save = []
    
    for i in range(0, loops):
        save.append(inputstring[i * divlim: (i + 1) * divlim])
    
    return save
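
# For instance, textBreak applied to a 25,000-character string returns two
# complete 10,000-character sections; the trailing 5,000 characters are
# dropped by the integer division above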

def info_for_graph(input_list):
    # Sort so the label-to-genre mapping (and plot colors) are reproducible
    # across runs; bare set iteration order is arbitrary
    unique_values = sorted(set(input_list))
    
    unique_labels = list(range(len(unique_values)))
    unique_dictionary = dict(zip(unique_values, unique_labels))
    
    # Integer class label for each document
    class_list = [unique_dictionary[item] for item in input_list]
    
    return unique_labels, np.array(class_list), unique_values
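
# For instance, with hypothetical genre labels, info_for_graph(["novel",
# "history", "novel"]) returns ([0, 1], array([1, 0, 1]),
# ["history", "novel"]) once the genre set is sorted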

info_list = []
title_list = []
author_list = []
era_list = []
genre_list = []
section_number = []
title_author = {}
title_era = {}
title_genre = {}

# Decide whether or not to break the text apart
break_apart = False

# Decide whether or not to normalize
normalize = True

metadata = {}

# Add the name of the metadata file here
with open("metadata.txt", "r", encoding="utf8") as metadatafile:
    metadatastring = metadatafile.read()

lines = metadatastring.split("\n")
for line in lines:
    if not line.strip():
        continue  # skip blank lines, e.g. a trailing newline
    cells = line.split("\t")
    metadata[cells[0]] = cells[1:]
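
# Each metadata row is assumed to be tab-separated in the order
#     filename-stem <TAB> title <TAB> author <TAB> era <TAB> genre
# so that metadata["somefile"] yields [title, author, era, genre]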

# Here you will set the name of the directory the files are stored in.
for root, dirs, files in os.walk("corpus"):
    for filename in files:
        if filename[0] != ".":  # skip hidden files such as .DS_Store
            with open(os.path.join(root, filename), "r", encoding="utf8") as f:
                c = f.read()
            c = re.sub(r"\s+", "", c)  # remove all whitespace, including newlines
            c = clean(c)
            
            # Look up this file's metadata, keyed by filename minus its
            # extension (assumes a four-character suffix such as ".txt")
            metainfo = metadata[filename[:-4]]
            
            # metainfo holds [title, author, era, genre]
            title_author[metainfo[0]] = metainfo[1]
            title_era[metainfo[0]] = metainfo[2]
            title_genre[metainfo[0]] = metainfo[3]
            
            if not break_apart:
                info_list.append(c)
                title_list.append(metainfo[0])
                author_list.append(metainfo[1])
                era_list.append(metainfo[2])
                genre_list.append(metainfo[3])

            else:
                broken_sections = textBreak(c)
                
                info_list.extend(broken_sections)

                title_list.extend([metainfo[0]] * len(broken_sections))
                author_list.extend([metainfo[1]] * len(broken_sections))
                era_list.extend([metainfo[2]] * len(broken_sections))
                genre_list.extend([metainfo[3]] * len(broken_sections))
                section_number.extend(range(len(broken_sections)))

# Decide how to construct the vectorizer (TfidfVectorizer takes the same
# arguments if you want tf-idf weights instead of raw counts)
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 1),
                             max_features=100)

word_count_matrix = vectorizer.fit_transform(info_list)
vocab = vectorizer.get_feature_names_out()

if normalize:
    # Convert the sparse count matrix to a dense array for pandas
    dense_words = word_count_matrix.toarray()

    corpus_dataframe = DataFrame(dense_words, columns=vocab)

    # Total occurrences of the kept characters in each document
    doclengths = corpus_dataframe.sum(axis=1)

    # Rescale each row to occurrences per 1,000 characters so that long
    # and short texts are directly comparable
    per_thousand = corpus_dataframe.divide(doclengths, axis=0) * 1000

    print(per_thousand)

    word_count_matrix = per_thousand.to_numpy()
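
# Worked example of the normalization above (hypothetical numbers): a
# character counted 250 times in a document whose kept characters total
# 50,000 becomes 250 / 50000 * 1000 = 5.0 occurrences per thousand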

pca = PCA(n_components=2)

if not normalize:
    # Raw counts are still in a sparse matrix; PCA needs a dense array
    dense_words = word_count_matrix.toarray()
else:
    # The normalized per-thousand matrix is already a dense NumPy array
    dense_words = word_count_matrix

my_pca = pca.fit(dense_words).transform(dense_words)
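
# my_pca holds one row of (PC1, PC2) scores per document; the single call
# pca.fit_transform(dense_words) would produce the same result in one step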

unique_labels, info_labels, unique_genres = info_for_graph(genre_list)

# Make a color list with one color per genre; zip silently drops any genre
# beyond the last color, so keep this at least as long as unique_labels
colors = ["red", "magenta", "blue"]

plt.figure()

# This code is partially adapted from brandonrose.org/clustering
for color, each_class, label in zip(colors, unique_labels, unique_genres):
    plt.scatter(my_pca[info_labels == each_class, 0],
                my_pca[info_labels == each_class, 1],
                label=label, color=color)

# Decide whether or not to annotate the plot
annotate_plot = False
if annotate_plot:
    for i, text_label in enumerate(title_list):
        plt.annotate(text_label, xy=(my_pca[i, 0], my_pca[i, 1]),
                     size=8)

plt.title("Principal Component Analysis")
plt.xlabel("PC1: " + "{0:.2f}".format(pca.explained_variance_ratio_[0] * 100)+"%")
plt.ylabel("PC2: " + "{0:.2f}".format(pca.explained_variance_ratio_[1] * 100)+"%")
plt.legend()

plt.show()

loadings = pca.components_
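
# components_ has shape (2, n_features): loadings[0, i] and loadings[1, i]
# are the weights of character vocab[i] on PC1 and PC2 respectively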

plt.figure()

# Plot an invisible scatter (alpha=0) just to establish the axis limits;
# the characters themselves are drawn with annotate below
plt.scatter(loadings[0], loadings[1], alpha=0)

plt.title("Principal Component Loadings")
plt.xlabel("PC1: " + "{0:.2f}".format(pca.explained_variance_ratio_[0] * 100)+"%")
plt.ylabel("PC2: " + "{0:.2f}".format(pca.explained_variance_ratio_[1] * 100)+"%")

for i, txt in enumerate(vocab):
    plt.annotate(txt, (loadings[0, i], loadings[1, i]), horizontalalignment='center',
                 verticalalignment='center', size=8)
    
plt.show()


            一          三          上          下          不          与  \
0   27.877143   6.845936  26.273591  11.841618  34.353028   4.687307   
1   27.817947  10.084481  20.016744  18.190121  43.001751   3.957683   
2   33.331367   6.725267  25.721196  14.453425  39.407705   5.781370   
3   29.895996   8.547728  20.337698  13.979536  36.885764   2.863278   
4   38.371217   7.975542  15.722156  12.288242  29.767971   4.083773   
5   42.140946   4.799286  10.776093   6.458519  63.157895   8.474576   
6   41.840849   4.217528  11.221968   6.539955  55.385244   8.546532   
7   33.805693   9.313420  19.724803  16.507965  28.553450   7.159230   
8   32.400955   3.266042  10.132487   7.366650  39.909375   2.819335   
9   32.770973   8.023090   9.766867   8.821963  60.276255  11.699624   
10  31.887596  13.490596  14.924652  12.000145  35.605667   8.982989   
11  30.403190   6.492741  15.408553  12.897865  34.868650  10.731604   
12  25.658167   9.200013  13.090122  17.731512  42.880336  11.944100   
13  37.947215   9.809103  13.140194  15.342238  40.445533   9.576887   

            两          个          中          为    ...             还  \
0    2.898729  12.951770   9.682990   8.819539    ...      7.586037   
1    4.071847  11.188066   8.410077   8.219804    ...      6.659563   
2    4.483511  11.857708   6.725267   6.607280    ...      5.958351   
3    2.105352  13.979536   9.474083  10.526759    ...      4.926523   
4   11.246991  21.725966   5.280104   3.086830    ...      5.154563   
5    6.940232  17.341659   8.367529   8.367529    ...      8.688671   
6    7.914832  22.536834   7.431767   5.778199    ...      8.695167   
7   12.157850  22.373397   9.506045   5.178403    ...      1.836359   
8    5.946283  15.019513   4.477768   3.391762    ...      6.660479   
9    4.638617   0.188981  17.729826  21.801502    ...      1.417355   
10   5.200466  22.828070   6.888301   3.871144    ...      4.890291   
11  11.508074  15.502213   7.450489   2.764476    ...      5.519888   
12   4.806927   1.642632  17.215802  16.871996    ...      1.738134   
13   7.775215  16.431248  13.908907   8.559944    ...      4.259953   

            这          道          那         都          里          门  \
0   11.224867   6.352535  11.163192  3.083755   2.096953  11.964969   
1   19.636198   6.621508  13.661618  5.175432   7.877312   8.372022   
2   13.037579   6.076338  11.090791  3.539614   9.556958   9.792933   
3   19.158701   9.937261  13.095288  3.789633  10.021475   8.126658   
4   22.508751  33.393888  14.503670  6.469050  18.860679   7.635843   
5   16.449599  16.913470   6.155219  7.297056   4.299732   4.281891   
6   16.145514  25.472382   7.543244  6.521376   8.249261   4.551957   
7   13.641063  34.682137  16.719853  9.936241  17.766449   6.777191   
8   20.698307  29.332855  13.032069  7.112535  13.690092   3.327564   
9    0.008590   6.167643   0.146031  2.138918   3.461783   7.859879   
10  15.250940  44.419469  30.099055  4.749302  11.383823   7.214589   
11  14.894935  30.237020  14.795232  6.124145  15.641192  22.136955   
12   0.853150   3.482635   1.578964  4.819661   3.196129   4.590456   
13  10.129400  30.780564  11.154351  4.780436   9.624932   7.310784   

            问         面          马  
0    4.687307  3.083755   8.017762  
1    7.648984  3.462973   8.257858  
2    2.005781  3.362633   2.772698  
3    4.589667  3.158028   3.200135  
4    4.741016  4.083773   2.747131  
5    3.282783  7.154326   0.535236  
6    4.143210  5.722461   0.780336  
7    3.541090  7.444957  14.170781  
8    5.323033  4.755956   0.591151  
9   11.751164  1.975707   4.930678  
10   3.726128  3.830862   4.568031  
11   5.571249  5.121078   2.096772  
12   6.519594  5.138000  24.976920  
13   5.685276  4.091797   2.490311  

[14 rows x 100 columns]

In [ ]: