open() is Python's built-in function for opening files; pandas and NLTK have their own functions for reading text files. dir() will list every method or function available to call on an object. .read() will read the contents of a text file; we can print the result or save the read text to another variable. .readlines() will split our file into a list, where each item in the list is one line of our text file.
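A minimal sketch of .readlines(), using the same sample speech file as the cells below:
In [ ]:
# each item in the list returned by .readlines() is one line, newline included
infile = open('./rawText/obama_speeches_000.txt', 'r')
lines = infile.readlines()
print(len(lines))    # how many lines the file contains
print(lines[0])      # the first line (the title line, as we will see below)
infile.close()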
In [ ]:
infile = open('./rawText/obama_speeches_000.txt','r')
print(dir(infile))
In [ ]:
help(open)
In [ ]:
title = infile.readline()
date = infile.readline()
text = infile.read()
print(title, date, text)
In [ ]:
title = title[8:-3]   # slice off the leading label and the trailing characters
date = date[7:-3]
print(title,date)
In [ ]:
title = title.replace(' ','_')
print(title,date)
In [ ]:
outfile = open('./cleanText/' + date + '_' + title + '.txt', 'w')
outfile.write(text)
outfile.close()
Using a for loop, Python can batch process our text files. glob.glob() creates a list of filenames based on a pattern. The .close() method will close the file before moving on to the next file in the loop.
In [ ]:
import glob

filenames = glob.glob('./rawText/*.txt')
for filename in filenames:
    with open(filename, 'r') as infile:
        title = infile.readline().replace(' ', '_')[8:-3]   # same slicing as above
        date = infile.readline()[7:-3]
        text = infile.read()
    outfile = open('./cleanText/' + date + '_' + title + '.txt', 'w')
    outfile.write(text)
    outfile.close()
The BeautifulSoup library contains functions for working with XML & HTML files. BeautifulSoup is commonly used when web scraping in Python to parse HTML. The with open(...) as ...: pattern automatically closes the file after the indented lines are complete, meaning the .close() method is not necessary.
In [ ]:
from bs4 import BeautifulSoup
with open('./NYHT_raw/1322266375.xml', 'r') as infile:
    soup = BeautifulSoup(infile.read(), 'xml')
print(type(soup))
print(dir(soup))
print(soup)
In [ ]:
date = soup.find('NumericPubDate').text
recordid = soup.find('RecordID').text
print(date, recordid)
new_file = date + '_' + recordid + '.txt'
print(new_file)
In [ ]:
text = soup.find('FullText').text
print(text)
In [ ]:
with open('./NYHT_clean/' + new_file, 'w') as outfile:
    outfile.write(text)
In [ ]:
import glob
filenames = glob.glob('./NYHT_raw/*.xml')
for filename in filenames:
    with open(filename, 'r') as infile:
        soup = BeautifulSoup(infile.read(), 'xml')
    date = soup.find('NumericPubDate').text
    text = soup.find('FullText').text
    recordid = soup.find('RecordID').text
    with open('./NYHT_clean/' + date + '_' + recordid + '.txt', 'w') as outfile:
        outfile.write(text)
UTF-8 encoding allows us to work with scripts outside of the Latin alphabet. See the full Unicode character list here: all unicode characters
In [ ]:
rus = open('./rawText/anna_k.txt', encoding='utf-8')
print(rus.read())
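A minimal sketch of what the encoding argument handles: UTF-8 stores characters outside ASCII as multi-byte sequences, and encode()/decode() convert between strings and those bytes.
In [ ]:
s = 'Анна Каренина'
b = s.encode('utf-8')       # the multi-byte sequence stored on disk
print(b)
print(b.decode('utf-8'))    # decoded back into the original string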
The .lower() method will convert all characters in a string to lowercase. .strip() by default will remove leading and trailing whitespace (spaces, tabs, and newlines).
In [ ]:
infile = open('./NYHT_clean/19520926_1322266375.txt', 'r')
text = infile.read().lower()
print(text)
text = text.strip()
print(text)
The string library includes pre-built strings containing letters, numbers, and punctuation. The list() function breaks a string into a list of characters. The .replace() method will replace one string with another; in this case, we replace each special character in our list with an empty string, which is equivalent to deleting the special characters.
In [ ]:
import string
special = list(string.punctuation)
print(special)
for x in special:
    text = text.replace(x, '')
print(text)
NLTK provides built-in lists of stopwords: common words, such as articles and pronouns, that are usually filtered out before counting.
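A minimal sketch of loading the list directly from NLTK (this assumes the nltk package is installed and can download its data; the hard-coded set in the next cell appears to be this same English list):
In [ ]:
import nltk
nltk.download('stopwords')   # one-time download of the stopwords corpus
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('english'))
print(stopwords)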
In [ ]:
stopwords = {'the', 'do', 'in', 'can', "aren't", 'needn', "mightn't", 'it', 'you', 'through', 'her', 'against', 'because', 'into', "hasn't", 'down', 'why', "don't", 'again', 'than', "you're", 'themselves', 'does', 'weren', 'this', 'll', 'don', 'they', "didn't", 'as', 'there', 'too', 'off', "haven't", "mustn't", 'shan', 'all', 'such', 're', 'have', 'that', 'over', "it's", 'most', 'those', 'and', 'yours', "couldn't", 'under', 'didn', 'so', 'myself', 'itself', 'herself', "that'll", 'theirs', 'should', 'how', 'am', "you've", 'be', 'will', 'd', "needn't", 'i', 'is', 'couldn', 'or', 'himself', 'other', 'your', 'below', 'our', 'by', 'before', 'between', 'just', 'hasn', 'ain', "hadn't", 'not', 'did', 'his', 'shouldn', "wasn't", 'to', "isn't", 'wouldn', 'haven', 'wasn', 'ma', 'some', 'a', 'their', 'them', 'up', 'few', 'yourselves', 'doesn', 'm', 't', 'at', 'during', 'then', 'when', 'me', 'while', 'aren', 'more', 'isn', 'we', 'above', 'after', 'where', 'until', 'whom', 'o', "shan't", 'once', 'being', 'him', 'hers', 'an', 's', 'but', 'further', 'ours', 'both', "you'll", 'from', 'own', 'nor', 'who', 'been', "wouldn't", 'very', "won't", 'on', 'same', 'no', 'doing', 'what', 'she', 'mightn', 'ourselves', 'if', "should've", 'hadn', 'was', 'of', "you'd", 'are', "she's", 'yourself', 'out', 'he', 'mustn', "doesn't", 'which', 'having', "weren't", 'won', 'for', 'its', 'had', 'has', 'about', 'here', 'only', 'y', 'now', "shouldn't", 'my', 'each', 'were', 'with', 'these', 've', 'any'}
print(stopwords)
for stop in stopwords:
    text = text.replace(' ' + stop + ' ', ' ')
print(text)
In [ ]:
word_list = text.split()
print(word_list)
.isalpha(), .isnumeric(), .isalnum(), and .isascii() return True if all characters in the string match the criteria. .isalpha() returns True if there is at least one character & all characters are letters (any Unicode letter, not just the Latin alphabet). .isnumeric() returns True if there is at least one character & all characters are digits. .isalnum() returns True if there is at least one character & all characters are letters or digits. .isascii() returns True if all characters are ASCII letters, digits, or special characters. See the ASCII Table.
In [ ]:
w = ['abc','123','!?-','a2?']
for x in w:
    print(x, x.isalpha(), x.isnumeric(), x.isalnum(), x.isascii())
In [ ]:
final_words = []
for word in word_list:
    if word.isalpha():
        final_words.append(word)
print(final_words)
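os.system() passes a command string to the operating system's shell, letting Python run external programs. The next cell runs a script that appears to render logo.jpg as ASCII art, then reads the out.txt file it produces.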
In [ ]:
import os
cmd = 'python ./ascii_art.py --file logo.jpg --cols 120'
os.system(cmd)
open('out.txt', 'r').read()
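The Counter class from the collections library tallies the items in a list, producing a dictionary-like object that maps each unique item to its count. Its .most_common(n) method returns the n most frequent items.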
In [ ]:
from collections import Counter
pets = ["dog", "cat", "bird", "gnu", "dog", "dog", "cat"]
pet_counts = Counter(pets)
print(pet_counts)
In [ ]:
word_counts = Counter(final_words)
print(word_counts)
print(len(word_counts))
print(word_counts.most_common(10))
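json.dump() writes a dictionary out as a JSON file. The csv library's writer object writes to a CSV file one row at a time: .writerow() takes a list and writes it as a single row.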
In [ ]:
import json
word_dict = dict(word_counts)
print(word_dict)
with open('./output/19511216_1322285142.json', 'w') as outfile:
    json.dump(word_dict, outfile)
In [ ]:
import csv
with open('./output/19511216_1322285142.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['word', 'count'])
    for word in word_dict.keys():
        row = [word, word_dict[word]]
        print(row)
        w.writerow(row)
In [ ]:
stopwords = {'the', 'do', 'in', 'can', "aren't", 'needn', "mightn't", 'it', 'you', 'through', 'her', 'against', 'because', 'into', "hasn't", 'down', 'why', "don't", 'again', 'than', "you're", 'themselves', 'does', 'weren', 'this', 'll', 'don', 'they', "didn't", 'as', 'there', 'too', 'off', "haven't", "mustn't", 'shan', 'all', 'such', 're', 'have', 'that', 'over', "it's", 'most', 'those', 'and', 'yours', "couldn't", 'under', 'didn', 'so', 'myself', 'itself', 'herself', "that'll", 'theirs', 'should', 'how', 'am', "you've", 'be', 'will', 'd', "needn't", 'i', 'is', 'couldn', 'or', 'himself', 'other', 'your', 'below', 'our', 'by', 'before', 'between', 'just', 'hasn', 'ain', "hadn't", 'not', 'did', 'his', 'shouldn', "wasn't", 'to', "isn't", 'wouldn', 'haven', 'wasn', 'ma', 'some', 'a', 'their', 'them', 'up', 'few', 'yourselves', 'doesn', 'm', 't', 'at', 'during', 'then', 'when', 'me', 'while', 'aren', 'more', 'isn', 'we', 'above', 'after', 'where', 'until', 'whom', 'o', "shan't", 'once', 'being', 'him', 'hers', 'an', 's', 'but', 'further', 'ours', 'both', "you'll", 'from', 'own', 'nor', 'who', 'been', "wouldn't", 'very', "won't", 'on', 'same', 'no', 'doing', 'what', 'she', 'mightn', 'ourselves', 'if', "should've", 'hadn', 'was', 'of', "you'd", 'are', "she's", 'yourself', 'out', 'he', 'mustn', "doesn't", 'which', 'having', "weren't", 'won', 'for', 'its', 'had', 'has', 'about', 'here', 'only', 'y', 'now', "shouldn't", 'my', 'each', 'were', 'with', 'these', 've', 'any'}
punct = {'!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'}
digits = ['0','1','2','3','4','5','6','7','8','9']
In [ ]:
import csv, glob
from collections import Counter

cbd = open('./output/counts_by_date.csv', 'w', newline='')
w = csv.writer(cbd)
w.writerow(['word', 'count', 'date'])
for filename in glob.glob('./NYHT_clean/*.txt'):
    with open(filename, 'r') as infile:
        text = infile.read().lower()
    for p in punct:
        text = text.replace(p, '')
    for d in digits:
        text = text.replace(d, '')
    for stop in stopwords:
        text = text.replace(' ' + stop + ' ', ' ')
    words_by_file = Counter(text.split())
    word_dict = dict(words_by_file)
    for key in word_dict.keys():
        # filename[13:17] slices the year out of paths like './NYHT_clean/19520926_....txt'
        row = [key, word_dict[key], filename[13:17]]
        print(row)
        w.writerow(row)
cbd.close()
In [ ]:
import csv, glob
from collections import Counter

total_words = Counter()
for filename in glob.glob('./NYHT_clean/*.txt'):
    with open(filename, 'r') as infile:
        text = infile.read().lower()
    for stop in stopwords:
        text = text.replace(' ' + stop + ' ', ' ')
    for p in punct:
        text = text.replace(p, '')
    for d in digits:
        text = text.replace(d, '')
    total_words.update(text.split())
print(total_words)
with open('./output/total_counts.csv', 'w', newline='') as outfile:
    w = csv.writer(outfile)
    w.writerow(['word', 'count'])
    word_dict = dict(total_words)
    for key in word_dict.keys():
        row = [key, word_dict[key]]
        print(row)
        w.writerow(row)
In [ ]:
import glob, pickle
from collections import Counter

c = Counter()
for filename in glob.glob('./NYHT_clean/*.txt'):
    with open(filename, 'r') as infile:
        text = infile.read().lower()
    for stop in stopwords:
        text = text.replace(' ' + stop + ' ', ' ')
    for p in punct:
        text = text.replace(p, '')
    for d in digits:
        text = text.replace(d, '')
    c.update(text.split())
c = dict(c)
print(c)
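pickle serializes a Python object to a binary file, so the finished counts can be reloaded later without re-processing every text file. pickle.dump() writes the object and pickle.load() reads it back.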
In [ ]:
with open('NYHT.pickle', 'wb') as outfile:
    pickle.dump(c, outfile)
In [ ]:
with open('NYHT.pickle', 'rb') as infile:
    counts = pickle.load(infile)
print(counts)
matplotlib is a frequently used graphing library for Python. plt.plot() takes two or more arguments; the data for the two axes is the minimal requirement. .xlabel(), .ylabel(), and .title() add axis labels and a title to a matplotlib plot. See the full documentation: Matplotlib User Guide
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
year = [91,92,93,94,95,96,97,98,99]
count = [1,2,3,4,5,6,7,8,9]
plt.plot(year, count)
plt.xlabel('Year')
plt.ylabel('Word Count')
plt.title('Frequency of Word x')
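pandas reads tabular data, such as our counts_by_date.csv, into a DataFrame. groupby() aggregates rows that share a value in one column, sort_values() orders the result, and the .plot methods graph a DataFrame directly.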
In [ ]:
import pandas as pd
df = pd.read_csv('./output/counts_by_date.csv', index_col='date')
totals = df.groupby('word').sum().sort_values(by=['count'],ascending=False)
print(totals,df)
In [ ]:
mrs = df[df.word == 'mrs'].groupby('date')[['count']].sum()
mrs.plot.bar()
In [ ]:
terms = totals.loc[['mrs','mr','dr','prof']]
terms.plot.bar()
In [ ]:
totals.plot.hist(figsize=(20,5), bins=[x for x in range(0,200,10)])
In [ ]:
import json
data = open('YDN_counts.json','r')
count = json.load(data)
type(count)
In [ ]:
x = list(count['(gender|sex)'].keys())
y = list(count['(gender|sex)'].values())
#print(x,y)
plt.figure(figsize=(20,5))
plt.plot(x,y,'g--')
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.title('Frequency of "gender" and "sex" in Yale Daily News (1959-1979)')
plt.savefig('YDN_gender.png', dpi=300)
In [ ]:
x2 = list(count['(coed|coeducation)'].keys())
y2 = list(count['(coed|coeducation)'].values())
plt.figure(figsize=(20,5))
plt.plot(x,y,'g--', label = "SEX")
plt.plot(x2,y2,'b', label = "COED")
plt.legend(loc='upper left')
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.title('Frequency of Words in Yale Daily News (1959-1979)')
plt.savefig('YDN_terms.png', dpi=300)
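The datetime library converts dates between formats. datetime.strptime() parses a date string into a datetime object according to format codes (%B full month name, %d day, %Y four-digit year), and .strftime() writes a datetime object back out as a string in a new format.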
In [ ]:
from datetime import datetime
date = 'November 8, 2008'
isoDate = datetime.strptime(date, '%B %d, %Y')
print(isoDate, type(isoDate))
strDate = isoDate.strftime('%Y%m%d')
print(isoDate, strDate)