notebook.community

Edit and run



In [1]:

    
import os
import pandas as pd
!pip install xlrd
!pip install openpyxl









    



Requirement already satisfied (use --upgrade to upgrade): xlrd in /home/mos/anaconda3/lib/python3.5/site-packages
Requirement already satisfied (use --upgrade to upgrade): openpyxl in /home/mos/anaconda3/lib/python3.5/site-packages
Requirement already satisfied (use --upgrade to upgrade): jdcal in /home/mos/anaconda3/lib/python3.5/site-packages (from openpyxl)
Requirement already satisfied (use --upgrade to upgrade): et_xmlfile in /home/mos/anaconda3/lib/python3.5/site-packages (from openpyxl)



In [2]:

    
filename_mapping = pd.read_csv("./original_filenames.txt")
filename_mapping









    Out[2]:






  
    
      
      Old
      New
      Note
    
  
  
    
      0
      Foto Siria Ilario x Wikipedia.xlsx
      GAR_2016-06_no_1.xlxs
      NaN
    
    
      1
      Wiki 10° invio.xlsx
      GAR_2016-06_no_10.xlxs
      NaN
    
    
      2
      Wiki 11° invio.xlsx
      GAR_2016-06_no_11.xlxs
      NaN
    
    
      3
      Wiki 2°invio.xlsx
      GAR_2016-06_no_2.xlxs
      NaN
    
    
      4
      Wiki 3° invio.xlsx
      GAR_2016-06_no_3.xlxs
      NaN
    
    
      5
      Wiki 4° invio.xls
      GAR_2016-06_no_4.xlxs
      NaN
    
    
      6
      Wiki 5° invio.xls
      GAR_2016-06_no_5.xlxs
      NaN
    
    
      7
      Wiki 6° invio.xlsx
      GAR_2016-06_no_6.xlxs
      NaN
    
    
      8
      Wiki 7° invio.ods
      GAR_2016-06_7.xlxs
      Converted .ods to .xlxs first
    
    
      9
      Wiki 8° invio.xlsx
      GAR_2016-06_no_8.xlxs
      NaN
    
    
      10
      Wiki 9° invio.xls
      GAR_2016-06_no_9.xlxs
      NaN



In [7]:

    
excel_extensions = ["xlsx","xls"]



In [16]:

    
filenames = []
for root, dirs, files in os.walk("./"): # current directory
    for file in files:
        for ext in excel_extensions:
            if file.endswith(ext):
                filenames.append(file)



In [17]:

    
filenames









    Out[17]:





['Wiki 7° invio.xlsx',
 'Wiki 2°invio.xlsx',
 'Wiki 5° invio.xls',
 'Wiki 11° invio.xlsx',
 'Wiki 6° invio.xlsx',
 'Foto Siria Ilario x Wikipedia.xlsx',
 'Wiki 10° invio.xlsx',
 'Wiki 3° invio.xlsx',
 'Wiki 4° invio.xls',
 'Wiki 9° invio.xls',
 'Wiki 8° invio.xlsx']



In [18]:

    
df = pd.DataFrame()



In [19]:

    
for file in filenames:
    new_df = pd.read_excel(file, "Foglio1")
    print(new_df.columns)
    df = df.append(new_df)









    



Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
       'Nome autore'],
      dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
       'Nome autore'],
      dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
       'Nome autore'],
      dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
       'Nome autore'],
      dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
       'Nome autore'],
      dtype='object')






    



---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-19-44f69f94ecb2> in <module>()
      1 for file in filenames:
----> 2     new_df = pd.read_excel(file, "Foglio1")
      3     print(new_df.columns)
      4     df = df.append(new_df)

/home/mos/anaconda3/lib/python3.5/site-packages/pandas/io/excel.py in read_excel(io, sheetname, header, skiprows, skip_footer, index_col, names, parse_cols, parse_dates, date_parser, na_values, thousands, convert_float, has_index_names, converters, engine, squeeze, **kwds)
    167     """
    168     if not isinstance(io, ExcelFile):
--> 169         io = ExcelFile(io, engine=engine)
    170 
    171     return io._parse_excel(

/home/mos/anaconda3/lib/python3.5/site-packages/pandas/io/excel.py in __init__(self, io, **kwds)
    216                 self.book = xlrd.open_workbook(file_contents=data)
    217             else:
--> 218                 self.book = xlrd.open_workbook(io)
    219         elif engine == 'xlrd' and isinstance(io, xlrd.Book):
    220             self.book = io

/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
    420                 formatting_info=formatting_info,
    421                 on_demand=on_demand,
--> 422                 ragged_rows=ragged_rows,
    423                 )
    424             return bk

/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in open_workbook_2007_xml(zf, component_names, logfile, verbosity, use_mmap, formatting_info, on_demand, ragged_rows)
    792         x12sheet = X12Sheet(sheet, logfile, verbosity)
    793         heading = "Sheet %r (sheetx=%d) from %r" % (sheet.name, sheetx, fname)
--> 794         x12sheet.process_stream(zflo, heading)
    795         del zflo
    796         comments_fname = 'xl/comments%d.xml' % (sheetx + 1)

/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in own_process_stream(self, stream, heading)
    532                 elem.clear() # destroy all child elements (cells)
    533             elif elem.tag == U_SSML12 + "dimension":
--> 534                 self.do_dimension(elem)
    535             elif elem.tag == U_SSML12 + "mergeCell":
    536                 self.do_merge_cell(elem)

/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in do_dimension(self, elem)
    566             # print >> self.logfile, "dimension: ref=%r" % ref
    567             last_cell_ref = ref.split(':')[-1] # example: "Z99"
--> 568             rowx, colx = cell_name_to_rowx_colx(last_cell_ref)
    569             self.sheet._dimnrows = rowx + 1
    570             self.sheet._dimncols = colx + 1

/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in cell_name_to_rowx_colx(cell_name, letter_value)
     89             else: # start of row number; can't be '0'
     90                 colx = colx - 1
---> 91                 assert 0 <= colx < X12_MAX_COLS
     92                 break
     93     except KeyError:

AssertionError:



In [20]:

    
df.describe()









    Out[20]:






  
    
      
      Anno
    
  
  
    
      count
      270.000000
    
    
      mean
      2005.044444
    
    
      std
      12.419775
    
    
      min
      2002.000000
    
    
      25%
      2002.000000
    
    
      50%
      2004.000000
    
    
      75%
      2006.000000
    
    
      max
      2204.000000



In [33]:

    
df.to_excel("./GAR_2016-06_metadata.xlsx")



In [ ]:

	Old	New	Note
0	Foto Siria Ilario x Wikipedia.xlsx	GAR_2016-06_no_1.xlxs	NaN
1	Wiki 10° invio.xlsx	GAR_2016-06_no_10.xlxs	NaN
2	Wiki 11° invio.xlsx	GAR_2016-06_no_11.xlxs	NaN
3	Wiki 2°invio.xlsx	GAR_2016-06_no_2.xlxs	NaN
4	Wiki 3° invio.xlsx	GAR_2016-06_no_3.xlxs	NaN
5	Wiki 4° invio.xls	GAR_2016-06_no_4.xlxs	NaN
6	Wiki 5° invio.xls	GAR_2016-06_no_5.xlxs	NaN
7	Wiki 6° invio.xlsx	GAR_2016-06_no_6.xlxs	NaN
8	Wiki 7° invio.ods	GAR_2016-06_7.xlxs	Converted .ods to .xlxs first
9	Wiki 8° invio.xlsx	GAR_2016-06_no_8.xlxs	NaN
10	Wiki 9° invio.xls	GAR_2016-06_no_9.xlxs	NaN

	Anno
count	270.000000
mean	2005.044444
std	12.419775
min	2002.000000
25%	2002.000000
50%	2004.000000
75%	2006.000000
max	2204.000000