In [1]:
import os
import pandas as pd
!pip install xlrd
!pip install openpyxl


Requirement already satisfied (use --upgrade to upgrade): xlrd in /home/mos/anaconda3/lib/python3.5/site-packages
Requirement already satisfied (use --upgrade to upgrade): openpyxl in /home/mos/anaconda3/lib/python3.5/site-packages
Requirement already satisfied (use --upgrade to upgrade): jdcal in /home/mos/anaconda3/lib/python3.5/site-packages (from openpyxl)
Requirement already satisfied (use --upgrade to upgrade): et_xmlfile in /home/mos/anaconda3/lib/python3.5/site-packages (from openpyxl)

In [2]:
filename_mapping = pd.read_csv("./original_filenames.txt")
filename_mapping


Out[2]:
Old New Note
0 Foto Siria Ilario x Wikipedia.xlsx GAR_2016-06_no_1.xlxs NaN
1 Wiki 10° invio.xlsx GAR_2016-06_no_10.xlxs NaN
2 Wiki 11° invio.xlsx GAR_2016-06_no_11.xlxs NaN
3 Wiki 2°invio.xlsx GAR_2016-06_no_2.xlxs NaN
4 Wiki 3° invio.xlsx GAR_2016-06_no_3.xlxs NaN
5 Wiki 4° invio.xls GAR_2016-06_no_4.xlxs NaN
6 Wiki 5° invio.xls GAR_2016-06_no_5.xlxs NaN
7 Wiki 6° invio.xlsx GAR_2016-06_no_6.xlxs NaN
8 Wiki 7° invio.ods GAR_2016-06_7.xlxs Converted .ods to .xlxs first
9 Wiki 8° invio.xlsx GAR_2016-06_no_8.xlxs NaN
10 Wiki 9° invio.xls GAR_2016-06_no_9.xlxs NaN

In [7]:
excel_extensions = ["xlsx","xls"]

In [16]:
filenames = []
for root, dirs, files in os.walk("./"): # current directory
    for file in files:
        for ext in excel_extensions:
            if file.endswith(ext):
                filenames.append(file)

In [17]:
filenames


Out[17]:
['Wiki 7° invio.xlsx',
 'Wiki 2°invio.xlsx',
 'Wiki 5° invio.xls',
 'Wiki 11° invio.xlsx',
 'Wiki 6° invio.xlsx',
 'Foto Siria Ilario x Wikipedia.xlsx',
 'Wiki 10° invio.xlsx',
 'Wiki 3° invio.xlsx',
 'Wiki 4° invio.xls',
 'Wiki 9° invio.xls',
 'Wiki 8° invio.xlsx']

In [18]:
df = pd.DataFrame()

In [19]:
for file in filenames:
    new_df = pd.read_excel(file, "Foglio1")
    print(new_df.columns)
    df = df.append(new_df)


Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
       'Nome autore'],
      dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
       'Nome autore'],
      dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
       'Nome autore'],
      dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
       'Nome autore'],
      dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
       'Nome autore'],
      dtype='object')
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-19-44f69f94ecb2> in <module>()
      1 for file in filenames:
----> 2     new_df = pd.read_excel(file, "Foglio1")
      3     print(new_df.columns)
      4     df = df.append(new_df)

/home/mos/anaconda3/lib/python3.5/site-packages/pandas/io/excel.py in read_excel(io, sheetname, header, skiprows, skip_footer, index_col, names, parse_cols, parse_dates, date_parser, na_values, thousands, convert_float, has_index_names, converters, engine, squeeze, **kwds)
    167     """
    168     if not isinstance(io, ExcelFile):
--> 169         io = ExcelFile(io, engine=engine)
    170 
    171     return io._parse_excel(

/home/mos/anaconda3/lib/python3.5/site-packages/pandas/io/excel.py in __init__(self, io, **kwds)
    216                 self.book = xlrd.open_workbook(file_contents=data)
    217             else:
--> 218                 self.book = xlrd.open_workbook(io)
    219         elif engine == 'xlrd' and isinstance(io, xlrd.Book):
    220             self.book = io

/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
    420                 formatting_info=formatting_info,
    421                 on_demand=on_demand,
--> 422                 ragged_rows=ragged_rows,
    423                 )
    424             return bk

/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in open_workbook_2007_xml(zf, component_names, logfile, verbosity, use_mmap, formatting_info, on_demand, ragged_rows)
    792         x12sheet = X12Sheet(sheet, logfile, verbosity)
    793         heading = "Sheet %r (sheetx=%d) from %r" % (sheet.name, sheetx, fname)
--> 794         x12sheet.process_stream(zflo, heading)
    795         del zflo
    796         comments_fname = 'xl/comments%d.xml' % (sheetx + 1)

/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in own_process_stream(self, stream, heading)
    532                 elem.clear() # destroy all child elements (cells)
    533             elif elem.tag == U_SSML12 + "dimension":
--> 534                 self.do_dimension(elem)
    535             elif elem.tag == U_SSML12 + "mergeCell":
    536                 self.do_merge_cell(elem)

/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in do_dimension(self, elem)
    566             # print >> self.logfile, "dimension: ref=%r" % ref
    567             last_cell_ref = ref.split(':')[-1] # example: "Z99"
--> 568             rowx, colx = cell_name_to_rowx_colx(last_cell_ref)
    569             self.sheet._dimnrows = rowx + 1
    570             self.sheet._dimncols = colx + 1

/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in cell_name_to_rowx_colx(cell_name, letter_value)
     89             else: # start of row number; can't be '0'
     90                 colx = colx - 1
---> 91                 assert 0 <= colx < X12_MAX_COLS
     92                 break
     93     except KeyError:

AssertionError: 

In [20]:
df.describe()


Out[20]:
Anno
count 270.000000
mean 2005.044444
std 12.419775
min 2002.000000
25% 2002.000000
50% 2004.000000
75% 2006.000000
max 2204.000000

In [33]:
df.to_excel("./GAR_2016-06_metadata.xlsx")

In [ ]: