In [1]:
%%bash
whoami


rlyons

In [2]:
%whos


Interactive namespace is empty.

In [3]:
import numpy as np
import os
import matplotlib as plt

In [4]:
%whos


Variable   Type      Data/Info
------------------------------
np         module    <module 'numpy' from '/ho<...>kages/numpy/__init__.py'>
os         module    <module 'os' from '/home/<...>da3/lib/python3.6/os.py'>
plt        module    <module 'matplotlib' from<...>/matplotlib/__init__.py'>

In [5]:
%%bash
echo 'hi'


hi

In [6]:
file_path = os.path.expanduser('~/swc_hlt_9-17/swc_hlt_9-17/HLTSWFiles/tomsawyer.txt')
print(file_path)


/home/rlyons/swc_hlt_9-17/swc_hlt_9-17/HLTSWFiles/tomsawyer.txt

In [7]:
with open(file_path, 'r') as f:
    line1 = f.readline()
    print(line1)


The Project Gutenberg EBook of The Adventures of Tom Sawyer, Complete


In [8]:
fname = os.path.expanduser('~/swc_hlt_9-17/swc_hlt_9-17/HLTSWFiles/data/inflammation-01.csv')

In [9]:
my_array = np.loadtxt(fname, delimiter=',')
print(my_array, '\n', my_array.size, '\n', my_array.shape, '\n', my_array.dtype, '\n', my_array[0, 0], '\n', my_array[0:5, :10])


[[ 0.  0.  1. ...,  3.  0.  0.]
 [ 0.  1.  2. ...,  1.  0.  1.]
 [ 0.  1.  1. ...,  2.  1.  1.]
 ..., 
 [ 0.  1.  1. ...,  1.  1.  1.]
 [ 0.  0.  0. ...,  0.  2.  0.]
 [ 0.  0.  1. ...,  1.  1.  0.]] 
 2400 
 (60, 40) 
 float64 
 0.0 
 [[ 0.  0.  1.  3.  1.  2.  4.  7.  8.  3.]
 [ 0.  1.  2.  1.  2.  1.  3.  2.  2.  6.]
 [ 0.  1.  1.  3.  3.  2.  6.  2.  5.  9.]
 [ 0.  0.  2.  0.  4.  2.  2.  1.  6.  7.]
 [ 0.  1.  1.  3.  3.  1.  3.  5.  2.  4.]]

In [10]:
small_array = my_array[:5, :10]

In [11]:
small_array * 2


Out[11]:
array([[  0.,   0.,   2.,   6.,   2.,   4.,   8.,  14.,  16.,   6.],
       [  0.,   2.,   4.,   2.,   4.,   2.,   6.,   4.,   4.,  12.],
       [  0.,   2.,   2.,   6.,   6.,   4.,  12.,   4.,  10.,  18.],
       [  0.,   0.,   4.,   0.,   8.,   4.,   4.,   2.,  12.,  14.],
       [  0.,   2.,   2.,   6.,   6.,   2.,   6.,  10.,   4.,   8.]])

In [12]:
word_file = os.path.expanduser('~/swc_hlt_9-17/swc_hlt_9-17/HLTSWFiles/words')

In [13]:
with open(word_file, 'r') as f:
    words = f.readlines()

In [14]:
print('The number of words is:', len(words))


The number of words is: 235886

In [15]:
np.loadtxt??

In [16]:
word_array = np.loadtxt(word_file, dtype='S25')

In [17]:
print(word_array[:20])


[b'A' b'a' b'aa' b'aal' b'aalii' b'aam' b'Aani' b'aardvark' b'aardwolf'
 b'Aaron' b'Aaronic' b'Aaronical' b'Aaronite' b'Aaronitic' b'Aaru' b'Ab'
 b'aba' b'Ababdeh' b'Ababua' b'abac']

In [18]:
preface_file = os.path.expanduser('~/swc_hlt_9-17/swc_hlt_9-17/HLTSWFiles/tompreface.txt')

In [29]:
#word_list = []
other_words = []
other_chars = []

with open(preface_file, 'r') as f:
    """
    lines = f.readlines()
    for line in lines:
        for word in line.strip().split(' '):
            print(word[-3:])
            """
    for line in f.readlines():
        l = line.split(' ')
        for w in l:
            if (l.index(w) % 2) == 0: #change to ignore even-indices on ending lines
                other_words.append(w.strip())
            else:
                continue
                
    for w in other_words:
        for i in range(len(w)):
            if (i%2) == 0:
                other_chars.append(w[i])

print(other_words)
print(other_chars)


['MOST', 'the', 'recorded', 'this', 'really', 'one', 'two', 'experiences', 'my', 'the', 'those', 'boys', 'were', 'schoolmates', 'mine.', 'Finn', 'drawn', 'life;', 'Sawyer', 'but', 'not', 'an', 'is', 'combination', 'the', 'of', 'three', 'whom', 'knew,', 'therefore', 'to', 'composite', 'of', 'architecture.', '', 'The', 'superstitions', 'upon', 'all', 'among', 'and', 'in', 'West', 'of', 'story--that', 'to', 'thirty', 'forty', 'ago.', '', 'Although', 'book', 'intended', 'for', 'entertainment', 'boys', 'girls,', 'hope', 'will', 'be', 'by', 'and', 'on', 'account,', 'for', 'of', 'plan', 'been', 'try', 'pleasantly', 'adults', 'of', 'what', 'they', 'were', 'and', 'how', 'they', 'felt', 'and', 'thought', 'and', 'talked,', 'and', 'queer', 'they', 'engaged']
['M', 'S', 't', 'e', 'r', 'c', 'r', 'e', 't', 'i', 'r', 'a', 'l', 'o', 'e', 't', 'o', 'e', 'p', 'r', 'e', 'c', 's', 'm', 't', 'e', 't', 'o', 'e', 'b', 'y', 'w', 'r', 's', 'h', 'o', 'm', 't', 's', 'm', 'n', '.', 'F', 'n', 'd', 'a', 'n', 'l', 'f', ';', 'S', 'w', 'e', 'b', 't', 'n', 't', 'a', 'i', 'c', 'm', 'i', 'a', 'i', 'n', 't', 'e', 'o', 't', 'r', 'e', 'w', 'o', 'k', 'e', ',', 't', 'e', 'e', 'o', 'e', 't', 'c', 'm', 'o', 'i', 'e', 'o', 'a', 'c', 'i', 'e', 't', 'r', '.', 'T', 'e', 's', 'p', 'r', 't', 't', 'o', 's', 'u', 'o', 'a', 'l', 'a', 'o', 'g', 'a', 'd', 'i', 'W', 's', 'o', 's', 'o', 'y', '-', 'h', 't', 't', 't', 'i', 't', 'f', 'r', 'y', 'a', 'o', 'A', 't', 'o', 'g', 'b', 'o', 'i', 't', 'n', 'e', 'f', 'r', 'e', 't', 'r', 'a', 'n', 'e', 't', 'b', 'y', 'g', 'r', 's', 'h', 'p', 'w', 'l', 'b', 'b', 'a', 'd', 'o', 'a', 'c', 'u', 't', 'f', 'r', 'o', 'p', 'a', 'b', 'e', 't', 'y', 'p', 'e', 's', 'n', 'l', 'a', 'u', 't', 'o', 'w', 'a', 't', 'e', 'w', 'r', 'a', 'd', 'h', 'w', 't', 'e', 'f', 'l', 'a', 'd', 't', 'o', 'g', 't', 'a', 'd', 't', 'l', 'e', ',', 'a', 'd', 'q', 'e', 'r', 't', 'e', 'e', 'g', 'g', 'd']

In [49]:
def get_ngrams(word_list, n=2):
    ngrams = []
    word_list.insert(0, '<start>')
    word_list.append('<end>')
    for i in range(0, len(word_list)-(n-1)):
        gram = ' '.join(word_list[i:i+n])
        ngrams.append(gram)
        i += n
    return ngrams

In [50]:
get_ngrams(['Testing', 'ngram', 'getter', 'function', 'with', 'a', 'list', 'of', 'words.'])


Out[50]:
['<start> Testing',
 'Testing ngram',
 'ngram getter',
 'getter function',
 'function with',
 'with a',
 'a list',
 'list of',
 'of words.',
 'words. <end>']