Explore the different corpora


In [1]:
import sys
# Extend the module search path with ../python/ (resolved relative to the
# working directory); presumably this is where the `pentoref` package
# imported in the next cell lives -- confirm.
sys.path.append("../python/")

In [2]:
import pentoref.IO as IO

In [4]:
# Load the sub-corpus stored under ../../TAKECV_PENTOREF from its raw files
# into five dataframes (a database-backed method can be used instead).
# Judging by the variable names these hold words, utterances, references,
# scenes and actions -- confirm against pentoref.IO.
dfwords, dfutts, dfrefs, dfscenes, dfactions = IO.convert_subcorpus_raw_data_to_dataframes("../../TAKECV_PENTOREF")


TAKECV
extracting files in textgrid format
17 Transcription files read.
1828 text files read. 1244 XML files parsed.
Processing:
r5_p2...
r5_p1...
r4_p1...
r4_p2...
r9_p1...
r2_p2...
r2_p1...
r6_p2...
r6_p1...
r3_p1...
r3_p2...
r7_p1...
r8_p2...
r7_p2...
r1_p2...
r1_p1...
r9_p2...
Done.

In [5]:
# Display the word-level dataframe (columns: gameID, position, uttID, word).
dfwords


Out[5]:
gameID position uttID word
0 r5_p2_1 1 1 das:
1 r5_p2_1 2 1 hellblaue
2 r5_p2_1 3 1 ..
3 r5_p2_1 1 2 Objekt
4 r5_p2_1 2 2 was
5 r5_p2_1 3 2 aus
6 r5_p2_1 4 2 drei
7 r5_p2_1 5 2 Rechtecken
8 r5_p2_1 6 2 besteht
9 r5_p2_1 7 2 befindet
10 r5_p2_1 8 2 sich
11 r5_p2_1 9 2 rechts
12 r5_p2_1 10 2 vom
13 r5_p2_1 11 2 gelben
14 r5_p2_1 12 2 $C
15 r5_p2_2 1 3 das
16 r5_p2_2 2 3 .
17 r5_p2_2 3 3 pinke
18 r5_p2_2 4 3 B
19 r5_p2_2 5 3 befindet
20 r5_p2_2 6 3 sich
21 r5_p2_2 7 3 links
22 r5_p2_2 8 3 .
23 r5_p2_2 9 3 vom
24 r5_p2_2 10 3 braunen
25 r5_p2_2 11 3 $B
26 r5_p2_3 1 4 das
27 r5_p2_3 2 4 pinke
28 r5_p2_3 3 4 $B
29 r5_p2_3 4 4 befindet
... ... ... ... ...
20849 r9_p2_91 2 143 $T
20850 r9_p2_91 3 143 (
20851 r9_p2_91 4 143 links
20852 r9_p2_91 5 143 von
20853 r9_p2_91 6 143 dem
20854 r9_p2_91 7 143 ..
20855 r9_p2_91 8 143 grünen
20856 r9_p2_91 9 143 $L
20857 r9_p2_91 10 143 ...
20858 r9_p2_91 11 143 äh
20859 r9_p2_91 12 143 links
20860 r9_p2_91 13 143 von
20861 r9_p2_91 14 143 dem
20862 r9_p2_91 15 143 orangenen
20863 r9_p2_91 16 143 $L
20864 r9_p2_91 17 143 )
20865 r9_p2_92 1 144 grünes
20866 r9_p2_92 2 144 $T
20867 r9_p2_92 3 144 rechts
20868 r9_p2_92 4 144 von
20869 r9_p2_92 5 144 dem
20870 r9_p2_92 6 144 orangenen
20871 r9_p2_92 7 144 $C
20872 r9_p2_93 1 145 dunkelbraunes
20873 r9_p2_93 2 145 $L
20874 r9_p2_93 1 146 links
20875 r9_p2_93 2 146 über
20876 r9_p2_93 3 146 dem
20877 r9_p2_93 4 146 lilanen
20878 r9_p2_93 5 146 Stein

20879 rows × 4 columns


In [5]:
from os import listdir
from os.path import isdir, abspath, join, splitext
from re import match, sub, findall, finditer
from lxml.etree import parse
from lxml.etree import XMLSyntaxError
import lxml.etree
from io import StringIO, BytesIO

In [6]:
# Lenient lxml parser: recover=True asks lxml to keep parsing through broken
# XML instead of raising on it.
# NOTE(review): get_data_from_xml in this cell does not reference this parser.
parser = lxml.etree.XMLParser(recover=True) #recovers from bad characters.
# Intended usage: tree = lxml.etree.parse(filename, parser)

def get_data_from_xml(xmlpath):
    """Read the raw scene-description XML text below ``xmlpath``.

    Every sub-directory of ``xmlpath`` is visited in sorted order and its
    files are handled by extension (case-insensitive):

    * ``.txt`` files whose base name contains ``final-selected`` or
      ``landmark`` are read and counted for the summary line; their
      contents are not part of the return value.
    * ``.xml`` files whose base name contains no letter ``s`` are read and
      stored as text, keyed by the base name (file name without extension).

    Entries of ``xmlpath`` that are not directories, and sub-directories
    nested inside a recording folder, are skipped.

    Parameters
    ----------
    xmlpath : str
        Directory holding one sub-directory per recording.

    Returns
    -------
    dict
        Maps XML base name -> file content as text.  If an
        ``XMLSyntaxError`` is raised while a file is being read, the tuple
        ``(None, None, None)`` is returned instead.
        NOTE(review): nothing in the ``try`` body parses XML any more, so
        that error path is not expected to trigger; the mismatched return
        shape is kept unchanged for existing callers.
    """
    tiles = dict()
    landmarks = dict()
    scene_descriptions = dict()
    for folder in sorted(listdir(xmlpath)):
        folder_path = join(xmlpath, folder)
        if not isdir(folder_path):
            # A stray file next to the recording folders cannot be listed.
            continue
        for filename in sorted(listdir(folder_path)):
            file_path = join(folder_path, filename)
            # Test the full path: a bare file name would be resolved against
            # the working directory and never detect a nested directory.
            if isdir(file_path):
                continue
            name, ext = splitext(filename)
            ext = ext.lower()
            if ext == '.txt':
                if 'final-selected' in name:
                    with open(file_path) as handle:
                        tiles[name[:-15]] = handle.read()
                elif 'landmark' in name:
                    with open(file_path) as handle:
                        landmarks[name[:-9]] = handle.read()
            elif ext == '.xml' and 's' not in name:
                try:
                    # The with-block closes the handle as soon as the text is read.
                    with open(file_path) as handle:
                        xml = handle.read()
                    # UTF-8 round trip kept from the original.
                    # NOTE(review): presumably there to obtain a unicode
                    # object under Python 2 -- confirm.
                    xml = xml.encode("utf-8").decode("utf-8")
                    scene_descriptions[name] = xml
                except XMLSyntaxError:
                    print("Error parsing file!")
                    print(file_path)
                    return None, None, None
    print(str(len(tiles.keys())+len(landmarks.keys()))+' text files read. '+str(len(scene_descriptions))+' XML files parsed.')
    return scene_descriptions

In [7]:
# Read the scene-description XML files under the PENTOCV scene_information
# directory into memory.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that directory exists; `p` is also re-bound to a single parsed
# tree by a later cell.
p = get_data_from_xml("/home/dsg-labuser/git/pentoref/PENTOCV_PENTOREF/derived_data/multimodal_data/scene_information")


0 text files read. 38563 XML files parsed.

In [14]:
import sys
import math

def convert_size(size_bytes):
    """Format a byte count as a human-readable string in 1024-based units.

    Zero is returned as ``"0B"``; any other value is scaled to the largest
    unit not exceeding it, rounded to two decimals and rendered as
    ``"<value> <unit>"`` (e.g. ``"1.5 KB"``).
    """
    units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    if size_bytes == 0:
        return "0B"
    # Index of the unit = whole number of times 1024 divides into the value.
    exponent = int(math.floor(math.log(size_bytes, 1024)))
    scaled = round(size_bytes / math.pow(1024, exponent), 2)
    return "%s %s" % (scaled, units[exponent])
# Show one stored scene entry (dict order is arbitrary) and its reported size.
# Python 2 only: print statement and indexable dict.values().
# NOTE(review): sys.getsizeof measures only the object itself, not the objects
# it refers to, so this is not the memory footprint of a whole parsed scene.
# The captured output shows an lxml ElementTree, whereas get_data_from_xml as
# written above stores plain text -- the output predates that change.
print p.values()[0]
convert_size(sys.getsizeof(p.values()[0]))


<lxml.etree._ElementTree object at 0xb1f9338c>
Out[14]:
'32.0 B'

In [5]:
# Lenient parser: recover=True asks lxml to keep going on broken XML.
parser = lxml.etree.XMLParser(recover=True) #recovers from bad characters.
# Parse one scene file; the with-block closes the handle once parse() has
# consumed it (the original left the file open).
# NOTE(review): absolute, machine-specific path.
with open("/home/dsg-labuser/git/pentoref/PENTOCV_PENTOREF/derived_data/multimodal_data/scene_information/r8/r8_818543.478261.xml") as scene_file:
    p = parse(scene_file, parser)

In [6]:
# Display the repr of the parsed ElementTree object.
p


Out[6]:
<lxml.etree._ElementTree at 0xb3861f6c>

In [5]:
# Print lines 226451 through 226489 (1-based) of the open handle `file`.
# NOTE(review): `file` is not assigned in any visible cell; it presumably is a
# file object left over from a cell that was removed -- confirm and re-create
# it before running this on a fresh kernel.
count = 0  # stays 0 when the handle yields no lines
for count, line in enumerate(file, 1):
    if 226450 < count < 226490:
        print(line)


         <skewness horizontal="left-skewed" vertical="top-skewed"/>

         <nbEdges value="8"/>

      </shape>

      <colour BestResponse="Yellow">

         <distribution Blue="4.09799597348e-43" Brown="0.0229247897728" Gray="6.79334259207e-05" Green="4.70763398076e-11" Orange="0.184595789052" Pink="0.00458168931595" Purple="2.1652803925e-38" Red="0.00020111793191" Yellow="0.787628680454"/>

         <hsvValue H="21.7293526786" S="142.527901786" V="170.111049107"/>

         <rgbValue B="76.9402901786" G="144.027901786" R="169.650111607"/>

      </colour>

   </object>

   <object id="6">

      <position global="center top" x="217" y="156"/>

      <shape BestResponse="T">

         <distribution F="2.35518889062e-18" I="2.78348013814e-25" L="8.45398047286e-19" N="1.8479388816e-18" P="1.01760817162e-18" T="0.762004191676" U="7.41836387378e-19" V="0.237995808324" W="1.13722457289e-16" X="9.79843652367e-30" Y="5.05250197758e-19" Z="1.68931090181e-16"/>

         <orientation value="-22.1069923742"/>

         <skewness horizontal="left-skewed" vertical="bottom-skewed"/>

         <nbEdges value="8"/>

      </shape>

      <colour BestResponse="Gray">

         <distribution Blue="6.66936634691e-16" Brown="0.000898578968858" Gray="0.87187745977" Green="0.0427487645079" Orange="0.00424034017922" Pink="0.055378115445" Purple="3.72683858677e-19" Red="0.000151759316217" Yellow="0.0247049818132"/>

         <hsvValue H="45.562692703" S="62.1063720452" V="138.818088386"/>

         <rgbValue B="106.544193217" G="138.790339157" R="121.978931141"/>

      </colour>

   </object>

   <object id="7">

      <position global="right top" x="423" y="99"/>

      <shape BestResponse="F">

         <distribution F="0.857342841268" I="0.0" L="0.0" N="4.47428798782e-33" P="0.109616870989" T="4.30122185409e-11" U="1.11507393156e-18" V="7.25808466893e-18" W="1.09824368171e-16" X="0.0330402877002" Y="3.99216269244e-32" Z="3.43332312339e-33"/>

         <orientation value="5.07458284453"/>

         <skewness horizontal="left-skewed" vertical="symmetric"/>

         <nbEdges value="10"/>

      </shape>

      <colour BestResponse="Gray">

         <distribution Blue="1.09739446488e-14" Brown="2.30835100925e-16" Gray="0.472008028584" Green="0.0201696096035" Orange="4.03326451327e-09" Pink="0.217449010939" Purple="0.290349818132" Red="2.35286677016e-05" Yellow="3.99240554882e-11"/>

         <hsvValue H="114.539834567" S="21.7858075751" V="118.756639094"/>

         <rgbValue B="119.463648237" G="111.680017414" R="109.443622116"/>

      </colour>

   </object>

   <object id="8">

      <position global="center top" x="180" y="75"/>


In [ ]:


In [6]:
# Display the word-level dataframe.
# NOTE(review): the output captured below has extra columns (starttime,
# endtime, refID) and different gameIDs than the earlier display, so dfwords
# was presumably re-loaded from another sub-corpus in a cell that is no longer
# part of the notebook -- confirm and add that loading cell.
dfwords


Out[6]:
endtime gameID position refID starttime uttID word
0 320.583357 r4_1 1 1 320.249396 1 okay
1 323.103442 r4_1 1 1 322.691291 2 ähm
2 323.379596 r4_1 2 1 323.103442 2 also
3 323.635022 r4_1 3 1 323.379596 2 ich
4 323.750636 r4_1 4 1 323.635022 2 hab
5 324.272242 r4_1 5 1 323.828608 2 mir
6 325.772230 r4_1 1 1 325.586710 3 ein
7 326.143270 r4_1 2 1 325.772230 3 Kreuz
8 327.156780 r4_1 3 1 326.143270 3 ausgesucht
9 327.624739 r4_1 4 1 327.377097 3 äh
10 327.944539 r4_1 5 1 327.624739 3 unten
11 328.398927 r4_1 6 1 327.944539 3 rechts
12 328.614557 r4_1 7 1 328.398927 3 das
13 329.011128 r4_1 8 1 328.614557 3 blaue
14 329.754028 r4_1 9 1 329.011128 3 Kreuz
15 332.498644 r4_1 1 1 332.020427 4 das
16 332.877750 r4_1 2 1 332.563173 4 sieht
17 333.079401 r4_1 3 1 332.877750 4 jetzt
18 333.221902 r4_1 4 1 333.079401 4 gut
19 333.606385 r4_1 5 1 333.221902 4 aus
20 335.014355 r4_1 1 1 334.902796 5 soll
21 335.089638 r4_1 2 1 335.014355 5 ich
22 335.164921 r4_1 3 1 335.089638 5 das
23 335.409592 r4_1 4 1 335.164921 5 irgendwie
24 335.987935 r4_1 5 1 335.409592 5 bestätigen
25 336.283691 r4_1 6 1 335.987935 5 oder
26 339.030105 r4_1 1 1 338.604032 6 ja
27 339.161851 r4_1 2 1 339.030105 6 <v=""ist"">is'</v>
28 339.581287 r4_1 3 1 339.161851 6 richtig
29 342.498000 r4_2 1 2 341.828000 7 ähm
... ... ... ... ... ... ... ...
8575 2676.063984 r3_147 7 1032 2675.984825 309 der
8576 2676.320718 r3_147 8 1032 2676.063984 309 Ecke
8577 2684.008789 r3_147 1 1032 2683.609833 310 richtig
8578 2687.547987 r3_148 1 1033 2687.369449 311 das
8579 2687.787140 r3_148 2 1033 2687.547987 311 rote
8580 2688.125481 r3_148 3 1033 2687.787140 311 Kreuz
8581 2688.338184 r3_148 4 1033 2688.125481 311 links
8582 2688.671014 r3_148 5 1033 2688.338184 311 oben
8583 2692.532175 r3_148 1 1033 2692.082086 312 richtig
8584 2725.273291 r3_150 1 1034 2725.102468 313 das
8585 2725.631470 r3_150 2 1034 2725.273291 313 graue
8586 2726.105367 r3_150 3 1034 2725.631470 313 Kreuz
8587 2726.406237 r3_150 4 1034 2726.105367 313 links
8588 2726.560000 r3_150 5 1034 2726.406237 313 oben
8589 2726.738906 r3_150 6 1034 2726.560000 313 oben
8590 2729.850623 r3_150 1 1034 2729.435136 314 richtig
8591 2735.317718 r3_151 1 1035 2734.788716 315 der
8592 2735.676574 r3_151 2 1035 2735.317718 315 grüne
8593 2736.064508 r3_151 3 1035 2735.676574 315 Strich
8594 2736.317988 r3_151 4 1035 2736.064508 315 links
8595 2736.633185 r3_151 5 1035 2736.317988 315 unten
8596 2743.710483 r3_151 1 1035 2743.604683 316 die
8597 2743.988209 r3_151 2 1035 2743.710483 316 grüne
8598 2744.268139 r3_151 3 1035 2743.988209 316 Form
8599 2744.578928 r3_151 4 1035 2744.268139 316 links
8600 2744.757466 r3_151 5 1035 2744.578928 316 unten
8601 2744.905145 r3_151 6 1035 2744.757466 316 in
8602 2744.991108 r3_151 7 1035 2744.905145 316 der
8603 2745.240180 r3_151 8 1035 2744.991108 316 Ecke
8604 2748.164387 r3_151 1 1035 2747.718078 317 richtig

8605 rows × 7 columns


In [ ]: