Run `git clone https://github.com/NeerajSarwan/CPT.git` to download CPT.
If you do not have git, download and install it first.
Then run `cd CPT` to enter the folder.
In [1]:
from CPT import *
import pandas as pd
In [2]:
# Load both corpora as one lower-cased string each, with newlines removed.
# Context managers are used so the file handles are closed right after reading
# instead of being left open.
# NOTE: newlines are replaced by the empty string, so the last word of a line
# is joined directly to the first word of the next line.
with open('sample_sonnets.txt') as sample_file:  # smaller data sample
    sample_poem = sample_file.read().lower().replace('\n', '')
with open('sonnets.txt') as all_file:  # larger data sample
    all_poem = all_file.read().lower().replace('\n', '')
In [3]:
def generate_char_seq(whole_str, n):
    """
    Build a dataframe of every length-n sliding window over a sequence.

    Consecutive windows are shifted by exactly one element, so window k
    starts at position k of the input.

    param: whole_str: original text in string format (any sliceable sequence,
                      such as a list of words, works as well)
    param: n: the length of each window; must be at least 1
    return: a dataframe with one column per window (column k holds window k)
            and n rows; transpose it to get one window per row. The dataframe
            is empty when the input is shorter than n.
    raises: ValueError if n is smaller than 1
    """
    if n < 1:
        raise ValueError("n must be a positive sequence length")
    # There are len - n + 1 start positions; the "+ 1" keeps the final window
    # that ends on the last element of the input.
    window_count = len(whole_str) - n + 1
    windows = {
        start: list(whole_str[start:start + n])
        for start in range(window_count)
    }
    return pd.DataFrame(windows)
In [37]:
# Training data: sliding 410-character windows over the smaller sample,
# transposed so that each row is one window.
train_seq_len = 410
training_df = generate_char_seq(whole_str=sample_poem, n=train_seq_len).transpose()
training_df.head()
Out[37]:
In [77]:
# Test prompts are 20-character windows; only 7 separate rows are kept.
# The poem output tries to continue each of those 7 prompts.
test_seq_len = 20
all_testing_df = generate_char_seq(whole_str=sample_poem, n=test_seq_len).transpose()
testing_df = all_testing_df.iloc[[77, 99, 177, 199, 277, 299, 410]]
testing_df.head()
Out[77]:
In [78]:
# The CPT library has somewhat unusual input requirements, so the data is
# written to CSV files here and read back through the library's own loader.
# The training file is written without the dataframe index.
training_df.to_csv("train.csv", index=False)
# The test file keeps the default index column (index=False is not passed).
# NOTE(review): presumably intentional for the CPT loader - confirm.
testing_df.to_csv("test.csv")
In [79]:
# Create a CPT model, load the two CSV files through the model's own loader,
# then train on the loaded training data.
model = CPT()
train, test = model.load_files("train.csv", "test.csv")
model.train(train)
Out[79]:
In [80]:
# Request predictions for the test sequences.
# NOTE(review): the last two arguments are presumably the number of prompt
# items to use and the number of items to predict - confirm against CPT.predict.
predict_len = 10
predictions = model.predict(train,test,test_seq_len,predict_len)
In [81]:
# Display the raw predictions returned by the model.
predictions
Out[81]:
In [82]:
# Print every test prompt, a separating space, and then its predicted
# continuation, one prompt per line.
for row_pos in range(len(testing_df)):
    prompt_chars = testing_df.iloc[row_pos].tolist()
    continuation = list(predictions[row_pos])
    print(''.join(prompt_chars + [' '] + continuation))
In [84]:
# Training data: sliding 410-character windows over the larger corpus,
# transposed so that each row is one window.
train_seq_len = 410
training_df = generate_char_seq(whole_str=all_poem, n=train_seq_len).transpose()
training_df.head()
Out[84]:
In [91]:
# Test prompts are 30-character windows; only 12 separate rows are kept.
# The poem output tries to continue each of those 12 prompts.
test_seq_len = 30
all_testing_df = generate_char_seq(whole_str=all_poem, n=test_seq_len).transpose()
testing_df = all_testing_df.iloc[[1, 2, 3, 4, 5, 77, 99, 177, 199, 277, 299, 410]]
testing_df.head()
Out[91]:
In [92]:
# Write the full-corpus training and test windows to CSV for the CPT loader.
# The training file is written without the dataframe index.
training_df.to_csv("all_train.csv", index=False)
# The test file keeps the default index column (index=False is not passed).
# NOTE(review): presumably intentional for the CPT loader - confirm.
testing_df.to_csv("all_test.csv")
In [93]:
# Start from a new CPT model, load the full-corpus CSV files through the
# model's own loader, then train on the loaded training data.
model = CPT()
train, test = model.load_files("all_train.csv", "all_test.csv")
model.train(train)
Out[93]:
In [94]:
# Request predictions for the full-corpus test sequences.
# NOTE(review): test_seq_len is presumably the number of prompt items the
# model uses - confirm against CPT.predict.
predict_len = 10 # predict the next 10 characters
predictions = model.predict(train,test,test_seq_len,predict_len)
In [95]:
# Print every test prompt, a separating space, and then its predicted
# continuation, one prompt per line.
for row_pos in range(len(testing_df)):
    prompt_chars = testing_df.iloc[row_pos].tolist()
    continuation = list(predictions[row_pos])
    print(''.join(prompt_chars + [' '] + continuation))
In [4]:
# Word-level experiment: split the full corpus on whitespace and report how
# many tokens that yields.
all_words = all_poem.split()
word_count = len(all_words)
print(word_count)
In [5]:
# Training data for the word-level model: sliding windows of 1000 words,
# transposed so that each row is one window.
train_seq_len = 1000
training_df = generate_char_seq(all_words, n=train_seq_len).transpose()
training_df.head()
Out[5]:
In [15]:
# Test prompts are 20-word windows; only 10 separate rows are kept.
# The poem output tries to continue each of those 10 prompts with more words.
test_seq_len = 20
output_poem_rows = 10
all_testing_df = generate_char_seq(all_words, n=test_seq_len).transpose()
# Selected prompts are spaced train_seq_len rows apart, starting at row 0.
selected_row_idx_lst = [row * train_seq_len for row in range(output_poem_rows)]
testing_df = all_testing_df.iloc[selected_row_idx_lst]
testing_df.head()
Out[15]:
In [16]:
# Write the word-level training and test windows to CSV for the CPT loader.
# The training file is written without the dataframe index.
training_df.to_csv("all_train_words.csv", index=False)
# The test file keeps the default index column (index=False is not passed).
# NOTE(review): presumably intentional for the CPT loader - confirm.
testing_df.to_csv("all_test_words.csv")
In [17]:
# Start from a new CPT model, load the word-level CSV files through the
# model's own loader, then train on the loaded training data.
model = CPT()
train, test = model.load_files("all_train_words.csv", "all_test_words.csv")
model.train(train)
Out[17]:
In [18]:
# Request predictions for the word-level test sequences.
# NOTE(review): test_seq_len is presumably the number of prompt items the
# model uses - confirm against CPT.predict.
predict_len = 10 # predict the next 10 words
predictions = model.predict(train,test,test_seq_len,predict_len)
In [21]:
# Print every test prompt followed by its predicted words, space-separated,
# one prompt per line.
for row_pos in range(len(testing_df)):
    prompt_words = testing_df.iloc[row_pos].tolist()
    predicted_words = list(predictions[row_pos])
    print(' '.join(prompt_words + predicted_words))