In [22]:
import os, sys, string

In [14]:
# Directory holding the plain-text documents analysed by the cells below.
data_dir = "../test_data"

# os.listdir() returns entries in arbitrary order and also lists
# sub-directories. Sort the names so every run processes the documents in the
# same sequence, and keep regular files only so the later open() calls cannot
# hit a directory.
data_files = sorted(
    name
    for name in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, name))
)
data_files


Out[14]:
['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt', 'doc6.txt']

In [18]:
# Count the lines of every data file, print the per-file totals, and report
# the average line count across all files.
all_line_numbers = []

for file_name in data_files:
    print("file:", file_name)
    file_path = os.path.join(data_dir, file_name)

    with open(file_path) as file:
        # Iterating a text file yields one item per line, so counting the
        # items gives the line count without loading the whole file.
        n_lines = sum(1 for _ in file)

    print("lines:", n_lines)
    all_line_numbers.append(n_lines)

# With no data files there is nothing to average; say so instead of failing
# with ZeroDivisionError.
if all_line_numbers:
    print("AVERAGE LINE NUMBER:", sum(all_line_numbers) / len(all_line_numbers))
else:
    print("No data files found in", data_dir)


file: doc1.txt
lines: 46
file: doc2.txt
lines: 88
file: doc3.txt
lines: 77
file: doc4.txt
lines: 53
file: doc5.txt
lines: 56
file: doc6.txt
lines: 46
AVERAGE LINE NUMBER: 61.0

In [19]:
# Collect every punctuation character (as defined by string.punctuation) that
# occurs anywhere in the data files.
# Only found_symbols and all_symbols are (re)bound here; results computed by
# other cells are left alone.
found_symbols = set()
all_symbols = set(string.punctuation)

for file_name in data_files:
    file_path = os.path.join(data_dir, file_name)

    with open(file_path) as file:
        for line in file:
            # intersection() accepts any iterable, so the characters of the
            # line are matched against the punctuation set in one step;
            # update() silently ignores symbols that were already recorded.
            found_symbols.update(all_symbols.intersection(line))

print("ALL SYMBOLS FOUND:", found_symbols)


ALL SYMBOLS FOUND: {':', '%', '-', ',', '?', '"', '$', '/', ';', '!', "'", '.'}

In [23]:
# Character-length statistics (average, max, min) over every line of every
# data file. Each line is counted as one sentence.
# NOTE(review): assumes the documents hold one sentence per line - confirm.
total_sentence_length = 0
total_sentences = 0
max_sentence_length = 0
min_sentence_length = sys.maxsize  # sentinel: any real length is smaller

for file_name in data_files:
    file_path = os.path.join(data_dir, file_name)

    with open(file_path) as file:
        for line in file:
            # Text-mode reading hands back each line with its "\n" terminator
            # (except possibly the last line of a file). Strip it so the
            # terminator is not counted as part of the sentence and all lines
            # are measured consistently.
            sentence_length = len(line.rstrip("\n"))

            total_sentences += 1
            total_sentence_length += sentence_length
            max_sentence_length = max(max_sentence_length, sentence_length)
            min_sentence_length = min(min_sentence_length, sentence_length)

# Without any lines the average is undefined and the min would still be the
# sys.maxsize sentinel, so report the empty case explicitly.
if total_sentences:
    print("Average Sentence Length:", total_sentence_length / total_sentences)
    print("Max sentence length:", max_sentence_length)
    print("Min sentence length:", min_sentence_length)
else:
    print("No sentences found in", data_dir)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-23-d1c56a240f80> in <module>()
      4 total_sentence_length = 0
      5 total_sentences = 0
----> 6 max_sentence_length = sys.minsize
      7 min_sentence_length = sys.maxsize
      8 

AttributeError: module 'sys' has no attribute 'minsize'

In [ ]: