In [1]:
import os
import pandas as pd
!pip install xlrd
!pip install openpyxl
Requirement already satisfied (use --upgrade to upgrade): xlrd in /home/mos/anaconda3/lib/python3.5/site-packages
Requirement already satisfied (use --upgrade to upgrade): openpyxl in /home/mos/anaconda3/lib/python3.5/site-packages
Requirement already satisfied (use --upgrade to upgrade): jdcal in /home/mos/anaconda3/lib/python3.5/site-packages (from openpyxl)
Requirement already satisfied (use --upgrade to upgrade): et_xmlfile in /home/mos/anaconda3/lib/python3.5/site-packages (from openpyxl)
In [2]:
filename_mapping = pd.read_csv("./original_filenames.txt")
filename_mapping
Out[2]:
Old
New
Note
0
Foto Siria Ilario x Wikipedia.xlsx
GAR_2016-06_no_1.xlxs
NaN
1
Wiki 10° invio.xlsx
GAR_2016-06_no_10.xlxs
NaN
2
Wiki 11° invio.xlsx
GAR_2016-06_no_11.xlxs
NaN
3
Wiki 2°invio.xlsx
GAR_2016-06_no_2.xlxs
NaN
4
Wiki 3° invio.xlsx
GAR_2016-06_no_3.xlxs
NaN
5
Wiki 4° invio.xls
GAR_2016-06_no_4.xlxs
NaN
6
Wiki 5° invio.xls
GAR_2016-06_no_5.xlxs
NaN
7
Wiki 6° invio.xlsx
GAR_2016-06_no_6.xlxs
NaN
8
Wiki 7° invio.ods
GAR_2016-06_7.xlxs
Converted .ods to .xlxs first
9
Wiki 8° invio.xlsx
GAR_2016-06_no_8.xlxs
NaN
10
Wiki 9° invio.xls
GAR_2016-06_no_9.xlxs
NaN
In [7]:
excel_extensions = ["xlsx","xls"]
In [16]:
filenames = []
for root, dirs, files in os.walk("./"): # current directory
for file in files:
for ext in excel_extensions:
if file.endswith(ext):
filenames.append(file)
In [17]:
filenames
Out[17]:
['Wiki 7° invio.xlsx',
'Wiki 2°invio.xlsx',
'Wiki 5° invio.xls',
'Wiki 11° invio.xlsx',
'Wiki 6° invio.xlsx',
'Foto Siria Ilario x Wikipedia.xlsx',
'Wiki 10° invio.xlsx',
'Wiki 3° invio.xlsx',
'Wiki 4° invio.xls',
'Wiki 9° invio.xls',
'Wiki 8° invio.xlsx']
In [18]:
df = pd.DataFrame()
In [19]:
for file in filenames:
new_df = pd.read_excel(file, "Foglio1")
print(new_df.columns)
df = df.append(new_df)
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
'Nome autore'],
dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
'Nome autore'],
dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
'Nome autore'],
dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
'Nome autore'],
dtype='object')
Index(['Nome foto', 'Anno', 'Luogo', 'Nome monumento', 'Descrizione',
'Nome autore'],
dtype='object')
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-19-44f69f94ecb2> in <module>()
1 for file in filenames:
----> 2 new_df = pd.read_excel(file, "Foglio1")
3 print(new_df.columns)
4 df = df.append(new_df)
/home/mos/anaconda3/lib/python3.5/site-packages/pandas/io/excel.py in read_excel(io, sheetname, header, skiprows, skip_footer, index_col, names, parse_cols, parse_dates, date_parser, na_values, thousands, convert_float, has_index_names, converters, engine, squeeze, **kwds)
167 """
168 if not isinstance(io, ExcelFile):
--> 169 io = ExcelFile(io, engine=engine)
170
171 return io._parse_excel(
/home/mos/anaconda3/lib/python3.5/site-packages/pandas/io/excel.py in __init__(self, io, **kwds)
216 self.book = xlrd.open_workbook(file_contents=data)
217 else:
--> 218 self.book = xlrd.open_workbook(io)
219 elif engine == 'xlrd' and isinstance(io, xlrd.Book):
220 self.book = io
/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/__init__.py in open_workbook(filename, logfile, verbosity, use_mmap, file_contents, encoding_override, formatting_info, on_demand, ragged_rows)
420 formatting_info=formatting_info,
421 on_demand=on_demand,
--> 422 ragged_rows=ragged_rows,
423 )
424 return bk
/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in open_workbook_2007_xml(zf, component_names, logfile, verbosity, use_mmap, formatting_info, on_demand, ragged_rows)
792 x12sheet = X12Sheet(sheet, logfile, verbosity)
793 heading = "Sheet %r (sheetx=%d) from %r" % (sheet.name, sheetx, fname)
--> 794 x12sheet.process_stream(zflo, heading)
795 del zflo
796 comments_fname = 'xl/comments%d.xml' % (sheetx + 1)
/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in own_process_stream(self, stream, heading)
532 elem.clear() # destroy all child elements (cells)
533 elif elem.tag == U_SSML12 + "dimension":
--> 534 self.do_dimension(elem)
535 elif elem.tag == U_SSML12 + "mergeCell":
536 self.do_merge_cell(elem)
/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in do_dimension(self, elem)
566 # print >> self.logfile, "dimension: ref=%r" % ref
567 last_cell_ref = ref.split(':')[-1] # example: "Z99"
--> 568 rowx, colx = cell_name_to_rowx_colx(last_cell_ref)
569 self.sheet._dimnrows = rowx + 1
570 self.sheet._dimncols = colx + 1
/home/mos/anaconda3/lib/python3.5/site-packages/xlrd/xlsx.py in cell_name_to_rowx_colx(cell_name, letter_value)
89 else: # start of row number; can't be '0'
90 colx = colx - 1
---> 91 assert 0 <= colx < X12_MAX_COLS
92 break
93 except KeyError:
AssertionError:
In [20]:
df.describe()
Out[20]:
Anno
count
270.000000
mean
2005.044444
std
12.419775
min
2002.000000
25%
2002.000000
50%
2004.000000
75%
2006.000000
max
2204.000000
In [33]:
df.to_excel("./GAR_2016-06_metadata.xlsx")
In [ ]:
Content source: mattiasostmar/GAR_Syria_2016-06
Similar notebooks: