In [5]:
# import numpy and pandas, and DataFrame / Series
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
# Set some pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 100)
pd.set_option('max_colwidth',256)
pd.set_option('display.width', 80)
pd.set_option('display.max_rows', 10)

In [6]:
num=pd.read_csv("c:\\xbrl\\2009q2\\num.txt", sep='\t', header=0)
sub=pd.read_csv("c:\\xbrl\\2009q2\\sub.txt", sep='\t', header=0)
pre=pd.read_csv("c:\\xbrl\\2009q2\\pre.txt", sep='\t', header=0)
tag=pd.read_csv("c:\\xbrl\\2009q2\\tag.txt", sep='\t', header=0)

In [52]:
num_sub=num.merge(sub,'left','adsh')
num_sub_pre=num_sub.merge(pre,'left',['adsh','tag','version'])
num_sub.info()
print('\n')
num_sub_pre.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 4930 entries, 0 to 4929
Data columns (total 44 columns):
adsh          4930 non-null object
tag           4930 non-null object
version       4930 non-null object
coreg         0 non-null float64
ddate         4930 non-null int64
qtrs          4930 non-null int64
uom           4930 non-null object
value         4919 non-null float64
footnote      8 non-null object
cik           4930 non-null int64
name          4930 non-null object
sic           4930 non-null int64
countryba     4930 non-null object
stprba        4761 non-null object
cityba        4930 non-null object
zipba         4930 non-null object
bas1          4930 non-null object
bas2          1403 non-null object
baph          4761 non-null object
countryma     4930 non-null object
stprma        4761 non-null object
cityma        4930 non-null object
zipma         4930 non-null object
mas1          4930 non-null object
mas2          1238 non-null object
countryinc    4729 non-null object
stprinc       4560 non-null object
ein           4930 non-null int64
former        2232 non-null object
changed       2232 non-null float64
afs           4930 non-null object
wksi          4930 non-null int64
fye           4930 non-null int64
form          4930 non-null object
period        4930 non-null int64
fy            4930 non-null int64
fp            4930 non-null object
filed         4930 non-null int64
accepted      4930 non-null object
prevrpt       4930 non-null int64
detail        4930 non-null int64
instance      4930 non-null object
nciks         4930 non-null int64
aciks         240 non-null object
dtypes: float64(3), int64(13), object(28)
memory usage: 1.7+ MB


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5319 entries, 0 to 5318
Data columns (total 51 columns):
adsh          5319 non-null object
tag           5319 non-null object
version       5319 non-null object
coreg         0 non-null float64
ddate         5319 non-null int64
qtrs          5319 non-null int64
uom           5319 non-null object
value         5308 non-null float64
footnote      10 non-null object
cik           5319 non-null int64
name          5319 non-null object
sic           5319 non-null int64
countryba     5319 non-null object
stprba        5140 non-null object
cityba        5319 non-null object
zipba         5319 non-null object
bas1          5319 non-null object
bas2          1471 non-null object
baph          5140 non-null object
countryma     5319 non-null object
stprma        5140 non-null object
cityma        5319 non-null object
zipma         5319 non-null object
mas1          5319 non-null object
mas2          1294 non-null object
countryinc    5088 non-null object
stprinc       4909 non-null object
ein           5319 non-null int64
former        2390 non-null object
changed       2390 non-null float64
afs           5319 non-null object
wksi          5319 non-null int64
fye           5319 non-null int64
form          5319 non-null object
period        5319 non-null int64
fy            5319 non-null int64
fp            5319 non-null object
filed         5319 non-null int64
accepted      5319 non-null object
prevrpt       5319 non-null int64
detail        5319 non-null int64
instance      5319 non-null object
nciks         5319 non-null int64
aciks         266 non-null object
report        4737 non-null float64
line          4737 non-null float64
stmt          4737 non-null object
inpth         4737 non-null float64
rfile         4737 non-null object
plabel        4737 non-null object
negating      4737 non-null float64
dtypes: float64(7), int64(13), object(31)
memory usage: 2.1+ MB

In [47]:
num_dup=num[num.duplicated(['adsh','tag','version'],keep=False)][['adsh','tag','version','ddate','value']]

In [48]:
pre_dup=pre[pre.duplicated(['adsh','tag','version'],keep=False)][['adsh','tag','version','stmt','line']]

In [49]:
dup=num_dup.merge(pre_dup,'inner',['adsh','tag','version'])

In [42]:
dup.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 636 entries, 0 to 635
Data columns (total 6 columns):
adsh       636 non-null object
tag        636 non-null object
version    636 non-null object
value      636 non-null float64
stmt       636 non-null object
line       636 non-null int64
dtypes: float64(1), int64(1), object(4)
memory usage: 34.8+ KB

In [36]:
pd.set_option('display.max_rows', 650)
pd.set_option('display.width', 256)

In [50]:
dup.to_csv('dup.txt',sep='\t',header=0,index=False)

In [16]:
print(sub.iloc[:, :6].to_string())


                    adsh      cik                                name   sic countryba stprba
0   0000038074-09-000029    38074             FOREST LABORATORIES INC  2834        US     NY
1   0000092380-09-000018    92380               SOUTHWEST AIRLINES CO  4512        US     TX
2   0000277948-09-000066   277948                            CSX CORP  4011        US     FL
3   0000796343-09-000026   796343                   ADOBE SYSTEMS INC  7372        US     CA
4   0000831259-09-000047   831259  FREEPORT MCMORAN COPPER & GOLD INC  1000        US     AZ
5   0000891618-09-000150  1164727            NEWMONT MINING CORP /DE/  1040        US     CO
6   0000891618-09-000166    93410                        CHEVRON CORP  2911        US     CA
7   0000950123-09-008550    13610                      BOWNE & CO INC  2750        US     NY
8   0000950134-09-009913  1349436                SANDRIDGE ENERGY INC  1311        US     OK
9   0001002638-09-000023  1002638                      OPEN TEXT CORP  7373        CA     ON
10  0001031296-09-000011  1031296                    FIRSTENERGY CORP  4911        US     OH
11  0001104659-09-025790   883984                  ICU MEDICAL INC/DE  3841        US     CA
12  0001104659-09-029605   901491        PAPA JOHNS INTERNATIONAL INC  5812        US     KY
13  0001104659-09-031513  1124198                          FLUOR CORP  1600        US     TX
14  0001193125-09-088631  1335793                        CNX GAS CORP  1311        US     PA
15  0001193125-09-088639  1070412                   CONSOL ENERGY INC  1221        US     PA
16  0001193125-09-091929   884905                         PRAXAIR INC  2810        US     CT
17  0001193125-09-094508   108772                          XEROX CORP  3577        US     CT
18  0001193125-09-096522  1365135                    WESTERN UNION CO  7389        US     CO
19  0001193125-09-100014  1316631                LIBERTY GLOBAL, INC.  4841        US     CO
20  0001193125-09-101273    86144                         SAFEWAY INC  5411        US     CA
21  0001193125-09-101639    29669              RR DONNELLEY & SONS CO  2750        US     IL
22  0001193125-09-102866   818686  TEVA PHARMACEUTICAL INDUSTRIES LTD  2834        IL    NaN
23  0001193125-09-109617  1368007                       NYSE EURONEXT  6200        US     NY
24  0001193125-09-111401  1080224                    EDGAR ONLINE INC  7389        US     CT
25  0001193125-09-124260   769397                        AUTODESK INC  7372        US     CA

In [3]:
pd.set_option('display.max_rows', 100)
num.info()
print('\n')
sub.info()
print('\n')
pre.info()
print('\n')
tag.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4930 entries, 0 to 4929
Data columns (total 9 columns):
adsh        4930 non-null object
tag         4930 non-null object
version     4930 non-null object
coreg       0 non-null float64
ddate       4930 non-null int64
qtrs        4930 non-null int64
uom         4930 non-null object
value       4919 non-null float64
footnote    8 non-null object
dtypes: float64(2), int64(2), object(5)
memory usage: 346.7+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 36 columns):
adsh          26 non-null object
cik           26 non-null int64
name          26 non-null object
sic           26 non-null int64
countryba     26 non-null object
stprba        25 non-null object
cityba        26 non-null object
zipba         26 non-null object
bas1          26 non-null object
bas2          8 non-null object
baph          25 non-null object
countryma     26 non-null object
stprma        25 non-null object
cityma        26 non-null object
zipma         26 non-null object
mas1          26 non-null object
mas2          7 non-null object
countryinc    25 non-null object
stprinc       24 non-null object
ein           26 non-null int64
former        12 non-null object
changed       12 non-null float64
afs           26 non-null object
wksi          26 non-null int64
fye           26 non-null int64
form          26 non-null object
period        26 non-null int64
fy            26 non-null int64
fp            26 non-null object
filed         26 non-null int64
accepted      26 non-null object
prevrpt       26 non-null int64
detail        26 non-null int64
instance      26 non-null object
nciks         26 non-null int64
aciks         1 non-null object
dtypes: float64(1), int64(11), object(24)
memory usage: 7.4+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3178 entries, 0 to 3177
Data columns (total 10 columns):
adsh        3178 non-null object
report      3178 non-null int64
line        3178 non-null int64
stmt        3178 non-null object
inpth       3178 non-null int64
rfile       3178 non-null object
tag         3178 non-null object
version     3178 non-null object
plabel      3178 non-null object
negating    3178 non-null int64
dtypes: int64(4), object(6)
memory usage: 248.4+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 919 entries, 0 to 918
Data columns (total 9 columns):
tag         919 non-null object
version     919 non-null object
custom      919 non-null int64
abstract    919 non-null int64
datatype    752 non-null object
iord        739 non-null object
crdr        592 non-null object
tlabel      919 non-null object
doc         789 non-null object
dtypes: int64(2), object(7)
memory usage: 64.7+ KB

In [ ]:
pre[pre.duplicated(['tag','version','adsh'])-pre.duplicated(['tag','version','adsh','stmt'])]

In [ ]:
num[(num.adsh=='0000950123-09-008550')&(num.tag=='AmortizationOfIntangibleAssets')]

In [ ]:
pre[(pre.adsh=='0000950123-09-008550')&(pre.tag=='AmortizationOfIntangibleAssets')]

In [ ]:
mrg[(mrg.adsh=='0000950123-09-008550')&(mrg.tag=='AmortizationOfIntangibleAssets')]

In [ ]:
mrg=num.merge(pre, 'left',['adsh','tag','version'])
mrg[mrg.duplicated(['line','value','stmt'],keep=False)].info()

In [ ]:
5319-4930

In [ ]: