Python [conda env:PY36]
Python [conda env:PY27_Test]

File Data Processing - Object Code

This code was created in Python 2.7 and cross-tested in Python 3.6. This is version 1.0 of this code. Comments include design ideas for future versions of this code.


In [1]:
''' Future design ideas:
    * add in file size and store it for easy access
    * if file size too big (compared to set limit), do not allow 
      getAllFileData (w/ warning to user)
    * allow user to set limit on file size for getAllFileData (but obj has default)
    * error handling for EOF on getLineFromFile?
    * can object track getLineFromFile calls and tell us what line we are on?
        
'''

class FileData(object):
    '''Bundles a text file with utility methods wrapped around a standard file object.

    Through myFileDataObj.f we can access all methods and attributes of the
    standard file object.  The file is opened read-only when the object is
    created; any method that finds the handle closed re-opens it on demand.

    Attributes:
        filename: path that was passed to the constructor.
        f:        the current file object (it may be closed between calls).
        lines:    list filled by getAllFileData() and emptied by passAllFileData().
    '''
    def __init__(self, xfilename):
        '''Open xfilename for reading and start with an empty "lines" list.'''
        self.filename = xfilename
        self._mode = 'r'  # this object will access files in read only mode by default
                          # _ is a hint that you should not really be changing this as per SO posts
        self.f = open(xfilename, 'r')
        self.lines = []

    def _ensureOpen(self, rewind = False):
        '''Re-open the file if it is closed; with rewind=True also guarantee position 0.'''
        if self.f.closed:
            self.f = open(self.filename, 'r')
        elif rewind:
            # An already-open handle may sit anywhere in the file (for example
            # after getLineFromFile calls), so go back to the start.
            self.f.seek(0)

    @staticmethod
    def _rawText(text):
        '''Return repr(text) without the surrounding quotes (so a newline shows as \\n).'''
        # repr() picks single or double quotes depending on the content, so the
        # delimiters are removed by position.  lstrip skips a possible string
        # prefix such as the u of u'...' on Python 2.
        return repr(text).lstrip("ubUB")[1:-1]

    def getAllFileData(self):
        '''Stores all data from the file in a "lines" list within the FileData object.

        Always reads from the beginning of the file, even if earlier
        getLineFromFile calls left the handle part-way through.  The file is
        closed afterwards.
        '''
        self._ensureOpen(rewind = True)
        try:
            self.lines = self.f.readlines()
        finally:
            self.f.close()

    def passAllFileData(self):
        '''Passes all file data from inside the FileData object to the outside (clears FileData.lines and returns it.)'''
        linesCopy = self.lines
        self.lines = []
        return linesCopy

    def getLineFromFile(self, raw = False, startIndex = False):
        '''Get next line from file.  Keeps file open for next function call.

        raw:        if true, return the line in repr form without quotes
                    (a newline is shown as the two characters \\n).
        startIndex: if true, prefix the line with "[n] " where n is the file
                    position (as reported by tell()) of the first character
                    in the line -- not the line number.

        Returns the line as a string; at end of file the line part is empty.
        '''
        self._ensureOpen()

        position = self.f.tell()       # must be taken before the line is consumed
        lnOut = self.f.readline()
        if raw:
            lnOut = self._rawText(lnOut)
        if startIndex:
            lnOut = "[%d] " % position + lnOut

        return lnOut

    def fileHead(self, numRows = 6):
        '''Prints the first numRows lines of the file in raw form, each prefixed with "[row] ".

        Always starts at the top of the file and stops early if the file has
        fewer than numRows lines.  The file is closed afterwards.  Nothing is
        returned.
        '''
        self._ensureOpen(rewind = True)

        print(r"Raw Content (newlines will display as '\n'):")
        try:
            for i in range(numRows):
                line = self.f.readline()
                if not line:           # readline() returns '' only at end of file
                    break
                # The surrounding quotes of the repr form are dropped.  If they
                # are wanted, print "[%d] %r" % (i, line) instead.
                print("[%d] %s" % (i, self._rawText(line)))
        finally:
            self.f.close()

In [2]:
import re
import collections

# These helper functions live at module level so they can be shared across objects:

def addToNestedDictionary(d, tup):
    '''Store one (outer key, inner key, value) record in a dictionary of dictionaries.

    d[tup[0]] is created as an empty dictionary the first time that outer key
    is seen; tup[2] is then stored under tup[1] inside it, replacing any value
    already stored there.  d is modified in place and nothing is returned.
    '''
    # http://stackoverflow.com/questions/8550912/python-dictionary-of-dictionaries
    outerKey = tup[0]
    innerDict = d.setdefault(outerKey, {})
    innerDict[tup[1]] = tup[2]   # to embed in a list: [tup[2]]

def output_NestedDictSummary(nstdDict, descr1 = "People in the Chat Log", 
                                       descr2 = "People each sent this many messages",
                                       descr3 = "chat records"):
    '''Print how many entries each sub-dictionary of a nested dictionary holds.

    Output: one line with the number of outer keys (labelled with descr1), a
    heading line (descr2), one tab-indented "key: count" line per outer key,
    and a final tab-indented total labelled with descr3.  The outer keys must
    be strings because they are concatenated into the output.  Run it with
    the default descriptions to see how the three labels are used.
    '''
    print("Number of %s: %s" % (descr1, len(nstdDict)))
    print("%s:" % descr2)

    grandTotal = 0
    for outerKey in nstdDict:
        subCount = len(nstdDict[outerKey])
        grandTotal += subCount
        print("\t" + outerKey + ": " + str(subCount))
    print("\tTotal: %d %s" % (grandTotal, descr3))

class chatLogOjb(object):
    '''Turns the raw lines of a chat log into a sorted, nested dictionary.

    The log is expected to alternate between a header line (matched by
    pattern1, e.g. "Jack 17:26:46\\n") and the message line that belongs to it.
    After construction, self.data is an OrderedDict keyed by the first field
    of the header (the speaker), sorted ascending.  Each value is an
    OrderedDict of "hh:mm:ss" -> message text, sorted by time string in
    descending order.  A later message with the same speaker and time
    replaces the earlier one.
    '''
    def __init__(self, fileLinesLst, pattern1 = r"^\D+\s\d\d:\d\d:\d\d\s$", pattern2 = r"\W+"):
        '''Store the patterns and build self.data from fileLinesLst.

        pattern1 recognises a header line; pattern2 is the separator used to
        split a header line into its fields.
        '''
        # patterns set by default but user can over-ride them if format of chat log file changes
        self.pattern1 = pattern1
        self.pattern2 = pattern2
        self.data = fileLinesLst
        self._debug_switch = False
        self.createChatDictionary()

    def getChatMeta(self, pattern, strng):
        '''Split a header line on pattern and return (field 0, [field 1, field 2, field 3]).

        With the default patterns this is (speaker, [hh, mm, ss]).  Raises
        IndexError if the split yields fewer than four fields.
        '''
        tmp = re.split(pattern, strng)
        return (tmp[0], [tmp[1], tmp[2], tmp[3]])

    def createChatDictionary(self):
        '''Replace the raw lines held in self.data with the sorted nested dictionary.

        Headers and messages are paired by position (first header with first
        message, and so on).  Surplus header lines are ignored; surplus message
        lines raise IndexError.
        '''
        headerRe = re.compile(self.pattern1)
        chatMsgs = []
        chatMeta = []
        for line in self.data:
            if headerRe.search(line):
                chatMeta.append(self.getChatMeta(self.pattern2, line))
            else:
                # drop only the line terminator, so a final line without a
                # trailing newline keeps its last character
                chatMsgs.append(line.rstrip("\r\n"))

        if self._debug_switch:
            print(len(self.data))
            print(chatMsgs[0:5])
            print(chatMeta[0:5])

        if len(chatMsgs) > len(chatMeta):
            raise IndexError("chat log has %d message lines but only %d header lines"
                             % (len(chatMsgs), len(chatMeta)))

        self.data = {}

        for (speaker, timeParts), msg in zip(chatMeta, chatMsgs):
            addToNestedDictionary(self.data, (speaker, ":".join(timeParts), msg))

        # sort the names in the outer dictionary and the records in each inner dictionary
        self.data = collections.OrderedDict(
            (speaker, collections.OrderedDict(sorted(records.items(), reverse=True)))
            for speaker, records in sorted(self.data.items()))
        # this sort worked on all tests that follow, but may want to explore this alternative for v2:
        #     OrderedDict(sorted(dict.items(), key=lambda v: v[1]))
        #     to reverse it: OrderedDict(sorted(dict.items(), key=lambda v: (-v[1], v[0])))

In [3]:
# Open the chat log and preview its first 13 lines in raw form.
chatLogFile = FileData("data/record.txt")
chatLogFile.fileHead(13)


Raw Content (newlines will display as '\n'):
[0] Jack 17:26:46\n
[1] The slogan duplicates a bliss after the plastic accent.\n
[2] Michael 17:26:47\n
[3] A tidied successor enlightens the advertised contract.\n
[4] William 17:26:48\n
[5] The secondary freedom matures beneath your suitable tip.\n
[6] John 17:26:50\n
[7] A wonderful newcomer expires on top of a cryptic entrance.\n
[8] William 17:26:51\n
[9] A bastard degenerates before the chorus!\n
[10] Emily 17:26:52\n
[11] A patronized winter safeguards the inheritance in the exceptional ashcan.\n
[12] Jack 17:26:53\n

In [4]:
# Second preview: the previous fileHead() call closed the file, so this one re-opens it
# and the output starts again at row [0].
chatLogFile.fileHead(3)


Raw Content (newlines will display as '\n'):
[0] Jack 17:26:46\n
[1] The slogan duplicates a bliss after the plastic accent.\n
[2] Michael 17:26:47\n

In [5]:
# Closing a file object that is already closed raises no error, so calling close() twice
# is safe ... once is enough though.
# This is a debug and testing cell: it makes sure no handle is left open and that the
# "if self.f.closed" checks inside the object's methods fire correctly.

chatLogFile.f.close()
chatLogFile.f.close()

In [6]:
# quick test: load the file into chatLogFile.lines, then show one entry by list index
chatLogFile.getAllFileData()
print(chatLogFile.lines[10])


Emily 17:26:52


In [7]:
# more testing of object methods:
# plain line, raw line, raw line with its start index, then positional arguments

print(chatLogFile.getLineFromFile())
print(chatLogFile.getLineFromFile(raw = True))
print(chatLogFile.getLineFromFile(raw = True, startIndex = True))
filePosition = chatLogFile.f.tell()
# (the recorded output shows a tuple because this run used Python 2's print statement)
print("current position: ", filePosition)
print(chatLogFile.getLineFromFile(True, True))  
      # note:  tell() is reporting back [index position] not line number (in this code)


Jack 17:26:46

The slogan duplicates a bliss after the plastic accent.\n
[72] Michael 17:26:47\n
('current position: ', 90L)
[90] A tidied successor enlightens the advertised contract.\n

In [8]:
# Illustration: getLineFromFile() reports a position within the file (a character index),
# not a line number, so using that value as an index into .lines picks unrelated lines
# further down the file.
print(chatLogFile.lines[filePosition])
print(chatLogFile.lines[filePosition+1])


Emma 17:27:54

Why won't the bugger steam beneath the planned stranger?


In [10]:
# chatLogFile = FileData("data/record.txt")  #did this in earlier test
# Load the whole file, hand the lines over to a chatLogOjb, and print a per-person summary.
chatLogFile.getAllFileData()
chatLog = chatLogOjb(chatLogFile.passAllFileData())   
#         strange glitch to investigate later:
#         run this cell once and you get 918 or a number that is slightly less than 922
#         run 2nd time and repeatedly and you get right count of 922 thereafter
#         NOTE(review): likely cause -- if getLineFromFile() left the file open part-way
#         through (as the earlier test cell does), a getAllFileData() that does not rewind
#         only reads the remaining lines; it then closes the file, so later runs start
#         from the top.  TODO confirm.
output_NestedDictSummary(chatLog.data)
# (each person's messages are sorted by time string in descending order; see Out[12])
print("Entire Dictionary is sorted.  Messages are in Time order.  Here are some samples.")
print("")


Number of People in the Chat Log: 10
People each sent this many messages:
	Daniel: 77
	Elizabeth: 89
	Emily: 122
	Emma: 87
	Jack: 90
	Jayden: 83
	John: 99
	Mary: 86
	Michael: 98
	William: 91
	Total: 922 chat records
Entire Dictionary is sorted.  Messages are in Time order.  Here are some samples.


In [11]:
# Same summary with custom description strings, to test that the three label
# arguments of output_NestedDictSummary() are used in the output:
output_NestedDictSummary(chatLog.data, "Records in ChatLogData", 
                         "Records attributed to each person", "total messages")


Number of Records in ChatLogData: 10
Records attributed to each person:
	Daniel: 77
	Elizabeth: 89
	Emily: 122
	Emma: 87
	Jack: 90
	Jayden: 83
	John: 99
	Mary: 86
	Michael: 98
	William: 91
	Total: 922 total messages

In [12]:
# passAllFileData() should have emptied chatLogFile.lines when the data was handed over
print("Rows in chatLogFile.lines: %d" %len(chatLogFile.lines)) 
                               # confirm we cleared out the chatLogFile.lines object
chatLog.data['Daniel']         # sample person's chat messages


Rows in chatLogFile.lines: 0
Out[12]:
OrderedDict([('18:02:46', 'The rocket skips above the mother.'),
             ('18:02:32',
              'The cream differential scratches against the objective dot.'),
             ('18:02:15', 'An arc festival prevails.'),
             ('18:02:07', 'A nut graduates!'),
             ('18:00:54', 'Each last reluctance shelters a dated microwave.'),
             ('17:59:31', 'An ally recognizes a withdrawn theft.'),
             ('17:58:11', 'A mouth fusses opposite her interior.'),
             ('17:58:07',
              "Why can't the charitable ink edge the nearest judge?"),
             ('17:57:50',
              'A virgin undergoes a rotate biscuit above a famine.'),
             ('17:57:23', 'The muscle bubbles behind a comfort!'),
             ('17:56:27', 'The home objective requests the wasted magazine.'),
             ('17:56:14',
              'A manager refines the spoilt dirt throughout the circuit.'),
             ('17:56:04', 'The rubbish repairs a drug after the dark skip.'),
             ('17:55:59', 'How will a conference beam?'),
             ('17:55:56', 'The monthly castle flashes after the thesis.'),
             ('17:55:54', 'My luck inconveniences a dimensional storm.'),
             ('17:55:05', 'The kidney coordinates an egg.'),
             ('17:55:01',
              'How can whatever adjacent arc meet the warm career?'),
             ('17:54:53', 'The feeling strength advances.'),
             ('17:54:52', 'The wild brigade blasts a flowing triumph.'),
             ('17:54:50', "Why can't the dinner stamp?"),
             ('17:54:24', 'The air scores past the overhead.'),
             ('17:54:00', 'The lavatory repairs the zero.'),
             ('17:53:39', 'The excessive giant behaves before a glance.'),
             ('17:53:37', 'The potato offers your freed faucet.'),
             ('17:53:27',
              'A suffering result attacks against the ineffective snow.'),
             ('17:52:08', 'The knight arrives.'),
             ('17:51:47', 'Opposite the assault riots a merged breed.'),
             ('17:51:36', 'A vet discriminates near a shocked capital.'),
             ('17:50:59',
              'Our orchestral needle presses against his married researcher.'),
             ('17:50:48', 'The image consents behind the highway.'),
             ('17:50:46', 'Our dip whistles before the parent!'),
             ('17:50:44', "Why can't a poet course an unbalanced audio?"),
             ('17:50:34', 'The apparent patient dies under a borderline.'),
             ('17:50:25', 'In the gospel screams the unwelcome dish.'),
             ('17:49:36', 'When can the shoulder dance the horizontal axis?'),
             ('17:49:24', 'The opponent credits a suicidal school.'),
             ('17:49:05', 'The peripheral winds the general.'),
             ('17:48:59', 'The suicide sacks the rock.'),
             ('17:48:43',
              'How will the projecting capital scream behind the century?'),
             ('17:48:08', 'An impending bug chews around a competitor.'),
             ('17:46:48',
              'Any willed appearance rages opposite the subsidized representative.'),
             ('17:46:41', 'The identified buss cruises above a funeral.'),
             ('17:46:37',
              'Every blank priest warns a union below the assuring basket.'),
             ('17:46:36', 'A pool suffers a smiling custard.'),
             ('17:46:19',
              "Why won't the cracking choir rest opposite the ideal?"),
             ('17:46:11', 'A golden appeal promotes our adjusted sock.'),
             ('17:45:55', 'Our deposit counts!'),
             ('17:45:46', 'Any gathered rat bolts.'),
             ('17:45:41', 'This constituent stunts a tile.'),
             ('17:45:33',
              'When will my opponent recover below a witnessed blessed?'),
             ('17:44:46', 'A damaging override reverts.'),
             ('17:44:44', 'A postponed prostitute stunts a science.'),
             ('17:44:10', 'A charge reads.'),
             ('17:43:54', 'The translator tanks a worst sword.'),
             ('17:43:34', 'The terrorist grows inside a felt doctrine.'),
             ('17:34:09', 'My angry squad grasps the twin.'),
             ('17:33:58',
              'The converted volume scratches underneath the very chart.'),
             ('17:33:57',
              'An angel silences the sung terror against a handful.'),
             ('17:33:16',
              'An applicable predecessor stumbles next to a wallet.'),
             ('17:33:13', 'Does the recruit camp?'),
             ('17:33:05',
              'How will the summary faithful scream before the flavor?'),
             ('17:32:29', 'The shot rants beneath a ruin!'),
             ('17:32:02',
              'Will the romantic center lift up the size elevator?'),
             ('17:31:33', 'The deaf girlfriend scores the listed tactic.'),
             ('17:30:49', 'The soap bolts before the rubbish.'),
             ('17:30:26',
              'The alternative irritates the phrase in the twisted chase.'),
             ('17:30:12',
              'An enforced arithmetic licenses a unexpected whale.'),
             ('17:28:38', 'Will the rectified champion duck?'),
             ('17:28:16',
              'The highway fries the whim beside the amended prostitute.'),
             ('17:28:01', 'A hardship pops underneath the baking hate.'),
             ('17:27:52', 'Any encouraging stunt pauses throughout a friend.'),
             ('17:27:48', 'A silent blackmail apologizes opposite the storm.'),
             ('17:27:40', 'Underneath the welcome offends the rising studio.'),
             ('17:27:29',
              'The compelling alert transports the cider around an unsolicited terrorist.'),
             ('17:27:10', 'Should a disco suspect the box?'),
             ('17:27:08', 'A gullible tear quits the violin.')])

Research and References

These questions were posted on Stack Overflow to clarify aspects of this code during its development. They also contain useful recommendations and explanations of how parts of the code, and the coding principles behind them, work:

Not Used

These objects were created but not used in the actual solution


In [13]:
class chatRecord(object):
    '''Plain container for one chat record: its speaker, record time and record message.'''
    def __init__(self, xSpeaker, xrecTime, xrecMsg):
        # keep the three constructor arguments unchanged as instance attributes
        self.Speaker, self.recTime, self.recMsg = xSpeaker, xrecTime, xrecMsg

In [ ]: