Here we import all of our wordlists and add them to a list which we can merge at the end.
The wordlists should not be filtered at this point. However, they should all contain the same columns to make merging easier later.
In [2]:
wordlists = []
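As a quick sanity check, data frames that share the same columns stack cleanly when concatenated; a minimal sketch with made-up rows:

import pandas as pd
a = pd.DataFrame({"Word": ["tree"], "WordType": ["NOUN"]})
b = pd.DataFrame({"Word": ["to run"], "WordType": ["VERB"]})
pd.concat([a, b])  # rows stack because the columns line up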
In [3]:
!head -n 20 de-en.txt
In [4]:
import pandas as pd
dictcc_df = pd.read_csv("de-en.txt",
                        sep='\t',
                        skiprows=8,
                        header=None,
                        names=["GermanWord", "Word", "WordType"])
In [5]:
dictcc_df[90:100]
Out[5]:
In [6]:
dictcc_df = dictcc_df[["Word", "WordType"]].copy()
In [7]:
word_types = dictcc_df["WordType"].astype('category')
dictcc_df["WordType"] = word_types
# show data types of each column in the dataframe
dictcc_df.dtypes
Out[7]:
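The category dtype stores each distinct tag only once and replaces the strings with integer codes, which is why it pays off for a column with a handful of repeated values. A rough way to see the effect (numbers will vary with the data):

# memory footprint as plain strings vs. as categories
dictcc_df["WordType"].astype('object').memory_usage(deep=True)
dictcc_df["WordType"].memory_usage(deep=True)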
In [8]:
# nltk's TaggedCorpusReader requires uppercase part-of-speech tags (WordType)
dictcc_df["WordType"] = dictcc_df["WordType"].str.upper()
dictcc_df["WordType"].value_counts().head()
Out[8]:
In [9]:
wordlists.append(dictcc_df)
In [10]:
# the readme file in `nltk/corpora/moby/mpos` gives some information on how to parse the file
result = []
# the file is ISO-8859-1 encoded, uses '\r' line endings and '×' as the
# word/tag separator: convert to UTF-8, then normalize those to '\n' and '/'
moby_words = !cat nltk/corpora/moby/mpos/mobyposi.i | iconv --from-code=ISO88591 --to-code=UTF8 | tr -s '\r' '\n' | tr -s '×' '/'
result.extend(moby_words)
moby_df = pd.DataFrame(data = result, columns = ['Word'])
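If shelling out is not an option, a rough pure-Python equivalent of the pipeline above (assuming the same file path and encoding) could look like this:

with open("nltk/corpora/moby/mpos/mobyposi.i", encoding="iso-8859-1") as f:
    text = f.read()
# entries are separated by '\r'; '×' separates word and tag code
lines = text.replace('\r', '\n').replace('×', '/').splitlines()
moby_df = pd.DataFrame(data=[line for line in lines if line], columns=['Word'])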
In [288]:
moby_df.tail(10)
Out[288]:
In [12]:
# Matches nouns
nouns = moby_df[moby_df["Word"].str.contains('/[Np]$')].copy()
nouns["WordType"] = "NOUN"
# Matches verbs
verbs = moby_df[moby_df["Word"].str.contains('/[Vti]$')].copy()
verbs["WordType"] = "VERB"
# Matches adjectives
adjectives = moby_df[moby_df["Word"].str.contains('/A$')].copy()
adjectives["WordType"] = "ADJ"
In [13]:
nouns["Word"] = nouns["Word"].str.replace(r'/N$','')
verbs["Word"] = verbs["Word"].str.replace(r'/[Vti]$','')
adjectives["Word"] = adjectives["Word"].str.replace(r'/A$','')
# Merge nouns, verbs and adjectives into one dataframe
moby_df = pd.concat([nouns,verbs,adjectives])
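The three regex passes could also be collapsed into a single extraction; a sketch of that alternative, where raw_moby_df stands for the frame as loaded above, before any filtering (hypothetical name, not what the notebook ran):

# pull the one-character tag code off the end, then map it to a tag name
codes = raw_moby_df["Word"].str.extract(r'/([NpVtiA])$', expand=False)
tag_map = {'N': 'NOUN', 'p': 'NOUN', 'V': 'VERB',
           't': 'VERB', 'i': 'VERB', 'A': 'ADJ'}
alt_df = raw_moby_df.assign(WordType=codes.map(tag_map)).dropna(subset=["WordType"])
alt_df["Word"] = alt_df["Word"].str.replace(r'/[NpVtiA]$', '', regex=True)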
In [284]:
wordlists.append(moby_df)
In [14]:
wordlist = pd.concat(wordlists)
In [15]:
wordlist_filtered = wordlist[wordlist["WordType"].notnull()]
In [16]:
# we choose [a-z] here and not [A-Za-z] because we do _not_
# want to match words starting with uppercase characters.
# ^to matches verbs in the infinitive from `dictcc`
word_chars = r'^[a-z]+$|^to\s'
is_word_chars = wordlist_filtered["Word"].str.contains(word_chars, na=False)
wordlist_filtered = wordlist_filtered[is_word_chars]
wordlist_filtered.describe()
wordlist_filtered["WordType"].value_counts()
Out[16]:
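To see what this filter keeps and drops, the pattern can be tested directly on a few illustrative inputs:

import re
pattern = re.compile(r'^[a-z]+$|^to\s')
[bool(pattern.search(w)) for w in ["tree", "to run", "Berlin", "self-made"]]
# -> [True, True, False, False]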
In [17]:
lt_x_letters = ((wordlist_filtered["Word"].str.len() < 9) |
                ((wordlist_filtered["Word"].str.contains(r'^to\s\w+\s')) &
                 (wordlist_filtered["Word"].str.len() < 11)))
wordlist_filtered = wordlist_filtered[lt_x_letters]
wordlist_filtered.describe()
Out[17]:
In [18]:
wordlist_filtered = wordlist_filtered.drop_duplicates("Word")
wordlist_filtered.describe()
wordlist_filtered["WordType"].value_counts()
Out[18]:
In [20]:
# The TaggedCorpusReader uses the forward slash character '/' by default
# as separator between the word and part-of-speech tag (WordType).
wordlist_filtered.to_csv("dictcc_moby.csv",index=False,sep="/",header=None)
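Each line of the file should now look like word/TAG, which matches the reader's default word/tag separator. A quick look (output depends on the data):

!head -n 3 dictcc_moby.csv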
In [21]:
from nltk.corpus import TaggedCorpusReader
from nltk.tokenize import WhitespaceTokenizer
nltk_wordlist = TaggedCorpusReader("./", "dictcc_moby.csv")
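A quick way to confirm the reader parses the file as intended (again, output depends on the data):

nltk_wordlist.tagged_words()[:5]
# expect (word, tag) pairs such as ('tree', 'NOUN'), depending on the wordlist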
In [178]:
# Our custom wordlist
import nltk
custom_cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in nltk_wordlist.tagged_words() if len(word) < 9 and word.isalpha())
In [179]:
# Brown Corpus
import nltk
brown_cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in nltk.corpus.brown.tagged_words() if word.isalpha() and len(word) < 9)
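Both distributions are keyed by tag: each condition maps to a frequency distribution over words, which makes them easy to inspect:

brown_cfd.conditions()[:10]     # part-of-speech tags seen in the corpus
brown_cfd["NN"].most_common(5)  # the five most frequent short nouns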
In [196]:
# Merge Nouns from all wordlists
nouns = set(brown_cfd["NN"]) | set(brown_cfd["NP"]) | set(custom_cfd["NOUN"])
# Lowercase all words to remove duplicates
nouns = set([noun.lower() for noun in nouns])
print("Total nouns count: " + str(len(nouns)))
In [195]:
# Merge Verbs from all wordlists
verbs = set(brown_cfd["VB"]) | set(brown_cfd["VBD"]) | set(custom_cfd["VERB"])
# Lowercase all words to remove duplicates
verbs = set([verb.lower() for verb in verbs])
print("Total verbs count: " + str(len(verbs)))
In [197]:
# Merge Adjectives from all wordlists
adjectives = set(brown_cfd["JJ"]) | set(custom_cfd["ADJ"])
# Lowercase all words to remove duplicates
adjectives = set([adjective.lower() for adjective in adjectives])
print("Total adjectives count: " + str(len(adjectives)))
In [266]:
def populate_degrees(nouns):
    degrees = {}
    nouns_copy = nouns.copy()
    for latitude in range(60):
        for longitude in range(190):
            degrees[(latitude, longitude)] = nouns_copy.pop()
    return degrees
In [267]:
def populate_minutes(verbs):
    minutes = {}
    verbs_copy = verbs.copy()
    for latitude in range(60):
        for longitude in range(60):
            minutes[(latitude, longitude)] = verbs_copy.pop()
    return minutes
In [268]:
def populate_seconds(adjectives):
    seconds = {}
    adjectives_copy = adjectives.copy()
    for latitude in range(60):
        for longitude in range(60):
            seconds[(latitude, longitude)] = adjectives_copy.pop()
    return seconds
In [269]:
def populate_fractions(nouns):
    fractions = {}
    nouns_copy = nouns.copy()
    for latitude in range(10):
        for longitude in range(10):
            fractions[(latitude, longitude)] = nouns_copy.pop()
    return fractions
In [271]:
def placewords(degrees, minutes, seconds, fractions):
    result = []
    result.append(populate_degrees(nouns).get(degrees))
    result.append(populate_minutes(verbs).get(minutes))
    result.append(populate_seconds(adjectives).get(seconds))
    result.append(populate_fractions(nouns).get(fractions))
    return "-".join(result)
In [281]:
# Located at 50°40'47.9" N 10°55'55.2" E
ilmenau_home = placewords((50,10),(40,55),(47,55),(9,2))
print("Feel free to stalk me at " + ilmenau_home)
from nltk.corpus import stopwords
dif = set(wordlist_filtered['Word']) - set(stopwords.words('english'))
names = nltk.corpus.names
names.fileids()
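The names corpus can then be subtracted the same way as the stopwords; a sketch, lowercasing the names to match our wordlist:

all_names = set(name.lower() for name in names.words())
dif = dif - all_names
len(dif)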
We want to remove all names and animals
We want to remove words that are difficult to spell
We want to remove homonyms that are used as different parts of speech (for example, saw as a verb vs. saw as a noun)
We want to remove arcane and unusual words
import nltk
def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)
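Running it over one of the Gutenberg texts shows the kind of words it would flag (output will vary):

unusual_words(nltk.corpus.gutenberg.words('austen-sense.txt'))[:10]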