In [2]:
%pylab inline
import pandas as pd
import os


Populating the interactive namespace from numpy and matplotlib

In [5]:
# Root directory of the HW03 working tree. The grep cells below read their
# input index files from this directory and write results under its
# word_indexes/ subdirectory.
# The original location remains the default; set the HW03_DIR environment
# variable to run the notebook against a checkout somewhere else.
full_path = os.environ.get('HW03_DIR', '/home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03')

In [3]:
# Single-word query terms; each one is looked up in the index by the grep loop.
words = [
    'architecture',
    'technology',
    'temperature',
    'academics',
    'concurrent',
    'experiment',
    'catalogue',
    'hierarchy',
]

In [12]:
# For every term in `words`, grep the sorted index file for lines that start
# with that term (whole-word match) and save them to word_indexes/raw/<term>.txt.

# The shell redirect (`>`) cannot create missing directories, so make sure the
# output directory exists before running any command.
raw_dir = os.path.join(full_path, 'word_indexes', 'raw')
if not os.path.isdir(raw_dir):
    os.makedirs(raw_dir)

for word in words:
    cmd = "grep -w '^{0}' {1}/fulldataoutput_18march_lowercase_v3_sorted.txt > {1}/word_indexes/raw/{0}.txt".format(word, full_path)
    status = os.system(cmd)
    print(cmd)
    if status != 0:
        # grep reports "no line matched" and genuine errors (e.g. missing
        # input file) through a non-zero exit status; surface it instead of
        # silently leaving an empty or missing output file.
        print('WARNING: command returned non-zero status {0} for {1!r} '
              '(no match or error)'.format(status, word))


grep -w '^architecture' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/fulldataoutput_18march_lowercase_v3_sorted.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/raw/architecture.txt
grep -w '^technology' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/fulldataoutput_18march_lowercase_v3_sorted.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/raw/technology.txt
grep -w '^temperature' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/fulldataoutput_18march_lowercase_v3_sorted.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/raw/temperature.txt
grep -w '^academics' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/fulldataoutput_18march_lowercase_v3_sorted.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/raw/academics.txt
grep -w '^concurrent' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/fulldataoutput_18march_lowercase_v3_sorted.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/raw/concurrent.txt
grep -w '^experiment' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/fulldataoutput_18march_lowercase_v3_sorted.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/raw/experiment.txt
grep -w '^catalogue' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/fulldataoutput_18march_lowercase_v3_sorted.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/raw/catalogue.txt
grep -w '^hierarchy' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/fulldataoutput_18march_lowercase_v3_sorted.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/raw/hierarchy.txt

BIGRAM


In [13]:
# Two-word query phrases; each one is looked up in the index by the grep loop.
words = [
    'computer science',
    'information retrieval',
    'power politics',
    'los angeles',
    'bruce willis',
]

In [16]:
# For every phrase in `words`, grep devdataoutput.txt for lines that start with
# that phrase (whole-word match) and save them to
# word_indexes/bigram/<phrase_with_underscores>.txt.

# The shell redirect (`>`) cannot create missing directories, so make sure the
# output directory exists before running any command.
bigram_dir = os.path.join(full_path, 'word_indexes', 'bigram')
if not os.path.isdir(bigram_dir):
    os.makedirs(bigram_dir)

for word in words:
    # Spaces are replaced by underscores only in the output file name; the
    # grep pattern keeps the phrase exactly as written.
    cmd = "grep -w '^{0}' {1}/devdataoutput.txt > {1}/word_indexes/bigram/{2}.txt".format(
        word, full_path, word.replace(' ', '_'))
    status = os.system(cmd)
    print(cmd)
    if status != 0:
        # grep reports "no line matched" and genuine errors (e.g. missing
        # input file) through a non-zero exit status; surface it instead of
        # silently leaving an empty or missing output file.
        print('WARNING: command returned non-zero status {0} for {1!r} '
              '(no match or error)'.format(status, word))


grep -w '^computer science' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/devdataoutput.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/bigram/computer_science.txt
grep -w '^information retrieval' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/devdataoutput.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/bigram/information_retrieval.txt
grep -w '^power politics' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/devdataoutput.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/bigram/power_politics.txt
grep -w '^los angeles' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/devdataoutput.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/bigram/los_angeles.txt
grep -w '^bruce willis' /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/devdataoutput.txt > /home/cmb-panasas2/skchoudh/github_projects/hatex/2019_Spring/CSCI-572/HW03/word_indexes/bigram/bruce_willis.txt

In [ ]: