In [1]:
## import libraries
import pandas as pd
import numpy as np
#allows us to show plots in notebook
%matplotlib inline
In [2]:
# Location of the SHEEF2010 catalogue text file, relative to the notebook's
# working directory; kept in a variable so it is easy to reuse.
filename = 'Data/SHEEF2010.txt'
# Read the file as whitespace-delimited text:
#   * sep is a raw string so the regex escape \s reaches pandas untouched
#     instead of being read as an (invalid) Python string escape;
#   * header=1 takes the column names from the second line of the file,
#     skipping the extra descriptive line above it;
#   * index_col='Date' makes the Date column the index.
# NOTE(review): in the filtered output further down, rows with no value in the
# DF column have their remaining fields shifted one column to the left (NaN
# ends up in MW2). Whitespace splitting cannot represent blank fields; if the
# file is fixed-width, pd.read_fwf may be the right reader -- confirm against
# the file layout.
sheef = pd.read_csv(filename, sep=r'\s+', header=1, index_col='Date')
sheef
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-2-c90ee10f6ea5> in <module>()
1 filename = 'Data/SHEEF2010.txt' #created a variable that it would be easy to call
2
----> 3 sheef = pd.read_csv(filename, sep = '\s+', header=1, index_col='Date')
4 #made dataframe into variable sheef, deleted unnecessary header, changed the index
5 sheef
/Users/Jessi/anaconda/lib/python3.4/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, float_precision, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format, skip_blank_lines)
489 skip_blank_lines=skip_blank_lines)
490
--> 491 return _read(filepath_or_buffer, kwds)
492
493 parser_f.__name__ = name
/Users/Jessi/anaconda/lib/python3.4/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
266
267 # Create the parser.
--> 268 parser = TextFileReader(filepath_or_buffer, **kwds)
269
270 if (nrows is not None) and (chunksize is not None):
/Users/Jessi/anaconda/lib/python3.4/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
581 self.options['has_index_names'] = kwds['has_index_names']
582
--> 583 self._make_engine(self.engine)
584
585 def _get_options_with_defaults(self, engine):
/Users/Jessi/anaconda/lib/python3.4/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
722 def _make_engine(self, engine='c'):
723 if engine == 'c':
--> 724 self._engine = CParserWrapper(self.f, **self.options)
725 else:
726 if engine == 'python':
/Users/Jessi/anaconda/lib/python3.4/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
1091 kwds['allow_leading_cols'] = self.index_col is not False
1092
-> 1093 self._reader = _parser.TextReader(src, **kwds)
1094
1095 # XXX
pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:3229)()
pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:6042)()
OSError: File b'Data/SHEEF2010.txt' does not exist
I used the pandas function `read_csv` because it can read delimited text (.txt) files.
I used `\s+` as the separator because it treats any run of white space as the delimiter; `sep = None` caused an error, which showed that the columns were separated by white space.
The file had an extra header line with information about the dataset, which was confusing pandas about the titles of the columns, so I skipped it with `header=1`.
I wanted the dataframe to be organized by time, so I set the `Date` column as the index.
In [ ]:
In [118]:
# Every column from the sixth one onward holds data that is not wanted here,
# so collect those labels and remove them, leaving the first five columns.
unwanted_columns = sheef.columns[5:]
sheef_drop = sheef.drop(labels=unwanted_columns, axis='columns')
sheef_drop
Out[118]:
MW
Longitude
Latitude
Source
Depth
Date
162701010000
3.4
-70.800
42.600
NCE
0.00
163806111900
6.5
-71.800
44.400
Ebe
0.00
163807010000
2.4
-70.900
42.500
NCE
0.00
163901250000
2.4
-70.950
42.500
NCE
0.00
164303151200
2.9
-70.800
42.800
NCE
0.00
164306111800
2.9
-70.800
42.800
NCE
0.00
165311080000
2.4
-70.900
42.600
NCE
0.00
165804140000
3.1
-70.900
42.500
NCE
0.00
166102101200
5.3
-73.000
45.500
GSC
0.00
166302051730
7.0
-70.100
47.600
GSC
0.00
166302052300
4.0
-70.100
47.600
GSC
0.00
166302061500
4.6
-70.100
47.600
GSC
0.00
166302071400
3.3
-70.100
47.600
GSC
0.00
166311160000
3.3
-70.100
47.600
GSC
0.00
166400000000
3.3
-70.100
47.600
GSC
0.00
166502240000
5.1
-70.000
47.800
GSC
0.00
166510152150
3.3
-71.220
46.820
GSC
0.00
166804030900
2.9
-71.100
42.350
NCE
0.00
166804131300
4.6
-70.500
47.100
GSC
0.00
166812190000
3.1
-71.500
42.500
Ebe
0.00
167202000000
3.3
-69.700
48.150
GSC
0.00
167312080000
2.6
-69.700
48.150
GSC
0.00
167712130000
2.9
-73.500
41.100
NCE
0.00
168502182100
3.1
-70.800
42.800
Ebe
0.00
169702201115
2.6
-70.800
42.000
Ebe
0.00
169801010000
2.9
-73.470
41.380
NCE
0.00
170001270500
9.0
-125.000
48.500
GSC
9.00
170102100000
2.4
-70.950
42.600
NCE
0.00
170103080000
2.4
-70.950
42.600
NCE
0.00
170201010000
2.9
-73.500
41.400
NCE
0.00
...
...
...
...
...
...
201012231238
3.7
-130.268
50.437
PGC
10.00
201012231247
3.7
-130.236
50.467
PGC
10.00
201012232047
3.4
-130.135
50.434
PGC
10.00
201012240307
2.1
-109.200
75.530
GSC
18.00
201012241416
2.2
-67.615
49.307
GSC
1.00
201012260434
3.4
-108.880
76.272
GSC
18.00
201012261944
2.8
-94.326
74.006
GSC
18.00
201012270439
2.8
-96.447
67.083
GSC
18.00
201012270448
2.2
-108.956
76.216
GSC
18.00
201012271500
2.7
-137.874
58.346
PGC
21.01
201012271608
2.6
-75.311
45.745
GSC
15.29
201012271653
3.0
-137.482
58.588
PGC
24.09
201012271708
2.4
-71.100
70.810
GSC
18.00
201012281123
2.9
-130.145
50.280
PGC
10.00
201012281157
3.2
-130.426
50.311
PGC
10.00
201012281217
2.4
-57.508
42.180
GSC
18.00
201012281238
3.4
-130.489
50.661
PGC
10.00
201012281634
2.8
-88.016
64.924
GSC
18.00
201012281857
3.4
-70.859
73.283
GSC
18.00
201012282224
3.2
-133.513
65.229
PGC
1.00
201012290055
3.6
-130.197
50.538
PGC
10.00
201012291415
2.7
-114.560
79.362
GSC
18.00
201012300038
2.0
-72.724
49.482
GSC
18.00
201012300149
4.0
-133.442
65.218
PGC
5.00
201012300622
3.0
-130.282
50.599
PGC
10.00
201012301255
3.8
-85.890
40.430
PDE
4.00
201012301823
3.0
-134.842
54.928
PGC
20.00
201012301921
5.0
-156.149
56.748
PDE
54.00
201012310601
2.9
-53.568
69.760
GSC
18.00
201012310949
3.8
-130.755
50.952
PGC
10.00
41690 rows × 5 columns
In [100]:
# Plot the MW column against the dataframe index (the Date column chosen as
# the index when the file was loaded). A title and a y-axis label are added so
# the figure can be understood without reading the surrounding cells.
# NOTE(review): the index values look like integers of the form YYYYMMDDhhmm,
# so the x axis is numeric rather than a true time axis -- confirm.
ax = sheef['MW'].plot(title='Earthquake magnitude (MW) by date')
ax.set_ylabel('MW')
# End on the Axes object so the cell's output is still the plot's Axes.
ax
Out[100]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a72ab38>
In [126]:
# Mark the rows whose MW value is at least 3.0, then keep only those rows.
# The notebook uses 3.0 as the cut-off for the earthquakes it wants to look at.
at_least_3 = sheef['MW'] >= 3.0
sheef_mag = sheef.loc[at_least_3]
sheef_mag
Out[126]:
MW
Longitude
Latitude
Source
Depth
DF
OM
OMT
MW2
Date
162701010000
3.4
-70.800
42.600
NCE
0.00
3.8
MbLg
3.37
NaN
163806111900
6.5
-71.800
44.400
Ebe
0.00
6.5
MbLg
6.46
NaN
165804140000
3.1
-70.900
42.500
NCE
0.00
3.5
MbLg
3.08
NaN
166102101200
5.3
-73.000
45.500
GSC
0.00
5.7
ML
5.29
NaN
166302051730
7.0
-70.100
47.600
GSC
0.00
7.0
Mw
7.00
NaN
166302052300
4.0
-70.100
47.600
GSC
0.00
4.4
ML
3.99
NaN
166302061500
4.6
-70.100
47.600
GSC
0.00
5.0
ML
4.59
NaN
166302071400
3.3
-70.100
47.600
GSC
0.00
3.7
ML
3.29
NaN
166311160000
3.3
-70.100
47.600
GSC
0.00
3.7
ML
3.29
NaN
166400000000
3.3
-70.100
47.600
GSC
0.00
3.7
ML
3.29
NaN
166502240000
5.1
-70.000
47.800
GSC
0.00
5.5
ML
5.09
NaN
166510152150
3.3
-71.220
46.820
GSC
0.00
3.7
ML
3.29
NaN
166804131300
4.6
-70.500
47.100
GSC
0.00
5.0
ML
4.59
NaN
166812190000
3.1
-71.500
42.500
Ebe
0.00
3.5
MbLg
3.08
NaN
167202000000
3.3
-69.700
48.150
GSC
0.00
3.7
ML
3.29
NaN
168502182100
3.1
-70.800
42.800
Ebe
0.00
3.5
MbLg
3.08
NaN
170001270500
9.0
-125.000
48.500
GSC
9.00
OT
9.00
NaN
NaN
172711100340
4.7
-70.600
42.800
NCE
0.00
5.1
MbLg
4.70
NaN
172711142200
3.1
-70.600
42.800
NCE
0.00
3.5
MbLg
3.08
NaN
172801050300
3.0
-70.600
42.800
NCE
0.00
3.4
MbLg
2.99
NaN
172802102030
3.2
-70.600
42.800
NCE
0.00
3.6
MbLg
3.18
NaN
172902101400
3.1
-70.600
42.800
NCE
0.00
3.5
MbLg
3.08
NaN
173209161600
6.3
-73.600
45.500
GSC
0.00
6.3
Mw
6.30
NaN
173712190345
4.8
-74.000
40.800
NCE
0.00
5.2
MbLg
4.81
NaN
174405270000
3.3
-71.200
46.800
GSC
0.00
3.7
ML
3.29
NaN
174406141515
4.2
-70.900
42.500
NCE
0.00
4.6
MbLg
4.17
NaN
175212172330
3.2
-76.300
40.000
SRA
0.00
3.6
MbLg
3.18
NaN
175511180912
5.5
-70.300
42.700
NCE
0.00
5.8
MbLg
5.53
NaN
175511181029
3.9
-70.300
42.700
NCE
0.00
4.3
MbLg
3.86
NaN
175511230127
3.0
-70.300
42.700
NCE
0.00
3.4
MbLg
2.99
NaN
...
...
...
...
...
...
...
...
...
...
201012150850
3.0
-130.338
50.342
PGC
10.00
F
3.0
ML
3.00
201012161725
3.2
-128.416
48.804
PGC
10.00
F
3.2
Mw
3.20
201012171201
4.3
-151.441
63.328
PDE
16.00
4.3
Mw
4.30
NaN
201012181910
3.1
-129.892
63.713
PGC
10.00
F
3.1
ML
3.10
201012190733
3.5
-130.351
50.619
PGC
10.00
F
3.5
Mw
3.50
201012192104
3.7
-133.462
64.750
PGC
1.00
F
3.7
Mw
3.70
201012192252
3.0
-125.442
49.358
PGC
44.64
3.0
ML
3.00
NaN
201012192302
3.1
-130.258
50.362
PGC
10.00
F
3.1
Mw
3.10
201012200335
3.1
-127.825
49.261
PGC
10.00
F
3.1
Mw
3.10
201012201618
3.1
-130.390
50.538
PGC
10.00
F
3.1
Mw
3.10
201012210445
3.2
-130.296
50.564
PGC
10.00
F
3.2
Mw
3.20
201012211421
3.2
-130.173
50.456
PGC
10.00
F
3.2
Mw
3.20
201012221746
3.6
-130.342
50.643
PGC
10.00
F
3.6
Mw
3.60
201012230917
3.0
-135.278
65.043
PGC
1.00
F
3.0
ML
3.00
201012231238
3.7
-130.268
50.437
PGC
10.00
F
3.7
Mw
3.70
201012231247
3.7
-130.236
50.467
PGC
10.00
F
3.7
Mw
3.70
201012232047
3.4
-130.135
50.434
PGC
10.00
F
3.4
Mw
3.40
201012260434
3.4
-108.880
76.272
GSC
18.00
F
3.9
MN
3.37
201012271653
3.0
-137.482
58.588
PGC
24.09
3.0
ML
3.00
NaN
201012281157
3.2
-130.426
50.311
PGC
10.00
F
3.2
Mw
3.20
201012281238
3.4
-130.489
50.661
PGC
10.00
F
3.4
Mw
3.40
201012281857
3.4
-70.859
73.283
GSC
18.00
F
3.9
ML
3.37
201012282224
3.2
-133.513
65.229
PGC
1.00
F
3.2
ML
3.20
201012290055
3.6
-130.197
50.538
PGC
10.00
F
3.6
Mw
3.60
201012300149
4.0
-133.442
65.218
PGC
5.00
F
4.0
Mw
4.00
201012300622
3.0
-130.282
50.599
PGC
10.00
F
3.0
Mw
3.00
201012301255
3.8
-85.890
40.430
PDE
4.00
3.8
Mw
3.80
NaN
201012301823
3.0
-134.842
54.928
PGC
20.00
F
3.0
ML
3.00
201012301921
5.0
-156.149
56.748
PDE
54.00
5.0
Mw
5.00
NaN
201012310949
3.8
-130.755
50.952
PGC
10.00
F
3.8
Mw
3.80
20809 rows × 9 columns
For my three operations, I chose to drop some columns, create a plot, and filter a dataframe.
1) By dropping the last four columns (keeping the first five of the nine), I got rid of unnecessary data and streamlined the information we see.
2) By creating a plot I showed an interesting relationship. I needed to have my index as date (which I did when I first loaded the data) so that I could have the x axis as the date. I wanted to see the magnitude of earthquakes over time, which is why I did this. You can see that the number and magnitude of earthquakes increase over time, which is actually really interesting: there could be several reasons for this, but primarily it means that something is changing in the region!
3) I wanted to create a new dataframe that showed earthquakes with a magnitude of 3.0 or greater. I chose this magnitude because about 3 on the Richter scale is the point at which an earthquake can be 'felt' by people.
In [ ]:
In [ ]:
Content source: jsully1/final_project-1
Similar notebooks: