In [22]:
import os, sys, string

In [14]:
data_dir = "../test_data"

data_files = os.listdir(data_dir)
data_files


Out[14]:
['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt', 'doc6.txt']

In [18]:
all_line_numbers = []
                                 
for file_name in data_files:
    print("file:", file_name)
    file_path = os.path.join(data_dir, file_name)
        
    n_lines = 0
     
    with open(file_path) as file:
        for line in file:
            n_lines += 1
            
        print("lines:", n_lines)
        all_line_numbers.append(n_lines)

print("AVERAGE LINE NUMBER:", sum(all_line_numbers)/len(all_line_numbers))


file: doc1.txt
lines: 46
file: doc2.txt
lines: 88
file: doc3.txt
lines: 77
file: doc4.txt
lines: 53
file: doc5.txt
lines: 56
file: doc6.txt
lines: 46
AVERAGE LINE NUMBER: 61.0

In [19]:
found_symbols = set()
all_symbols = set(string.punctuation)

all_line_numbers = []
                                 
for file_name in data_files:
    file_path = os.path.join(data_dir, file_name)
     
    with open(file_path) as file:
        for line in file:
            for char in line:
                if char in all_symbols:
                    if char not in found_symbols:
                        found_symbols.add(char)

print("ALL SYMBOLS FOUND:", found_symbols)


ALL SYMBOLS FOUND: {':', '%', '-', ',', '?', '"', '$', '/', ';', '!', "'", '.'}

In [24]:
found_symbols = set()
all_symbols = set(string.punctuation)

total_sentence_length = 0
total_sentences = 0
max_sentence_length = 0
min_sentence_length = sys.maxsize
                                 
for file_name in data_files:
    file_path = os.path.join(data_dir, file_name)
     
    with open(file_path) as file:
        for line in file:
            sentence_length = len(line)
            
            total_sentences += 1
            total_sentence_length += sentence_length
            
            if sentence_length > max_sentence_length:
                max_sentence_length = sentence_length
                
            if sentence_length < min_sentence_length:
                min_sentence_length = sentence_length

print("Average Sentence Length:", total_sentence_length / total_sentences)
print("Max sentence length:", max_sentence_length)
print("Min sentence length:", min_sentence_length)


Average Sentence Length: 306.7295081967213
Max sentence length: 954
Min sentence length: 10

In [27]:
for file_name in data_files:
    file_path = os.path.join(data_dir, file_name)
     
    with open(file_path) as file:
        for line in file:
            words = line.split()
            for word in words:
                if '-' in word:
                    print(word)
            
        print("lines:", n_lines)
        all_line_numbers.append(n_lines)


-
lay-people
-
-
-
-
-
-
-
-
-
-
-
-
ill-conceived
-
hard-earned
middle-class
fuel-efficient
-
-
-
-
ten-point
-
-
self-made
-
-
lines: 46
-
--
-
-
well-off
well-known,
-
-
--
-
-
-
-
-
-
-
-
middle-class
-
-
-
-
-
-
middle-management,
-
-
start-ups
high-wage,
high-tech
-
-
middle-class.
twenty-six
fuel-efficiency
stop-gap
long-term
re-tool,
fuel-efficient
-
world-class
-
-
-
twenty-first
-
Commander-in-Chief,
-
-
Bush-McCain
--
-
Commander-in-Chief,
-
worn-out
-
gang-violence
AK-47s
same-sex
-
-
nay-sayers
-
-
re-enlist
-
-
-
-
-
-
-
lines: 46
-
-they
-
-
-
one-hundred-and-five-year-old
African-American
middle-class
-
-
-
-
middle-class
high-school
college-educated
bottom-lines
-
-
flag-draped
can't-do,
won't-do,
won't-even-try
closed-door
nation-building.
-
-
-
-
-
-
-
-
-
-
-
world-class,
top-notch,
life-long
-
on-the-job
fuel-efficient,
-
105-year-old
twenty-first
-
-
lines: 46
low-income
tea-house
-
-
-
-
self-determination
-
post-colonial
-
-
-
-
-
--
self-interest
self-interest
-
-
-
-
job-creating
good-paying
--
-
-
-
-
-
-
-
-
-
-
-
-
-
-
ethnic-based
-
-
-
-
anti-corruption
-
-
-
-
-
-
-
-
-
-
-
lines: 46
soul-trying
self-interest
policy-by-slogan
seventy-five
well-respected
all-time
two-thirds
hard-earned
overly-rigid.
over-the-horizon
--
much-needed
ill-conceived,
half-finished
no-bid
condition-based
--
flag-draped
lines: 46
co-founder
non-descript
first-floor
mini-refrigerator.
co-founded
-
-
-
-
-
non-governmental
Nunn-Lugar
Soviet-supplied,
low-enriched
-
-
--
Nunn-Lugar
-
-
Nunn-Lugar
-
Nunn-Lugar
non-proliferation
bio-reconnaissance
bio-attack
-
-
next-generation
non-proliferation
-
-
U.S.-Russian
Nunn-Lugar
--
--
senior-level
Nunn-Lugar
differences--but
-
lines: 46

In [ ]: