In [1]:
import fim

In [2]:
import pandas as pd

In [13]:
# Load the clause/segment/relation table; the first CSV column
# is used as the row index.
df = pd.read_csv(
    'clause-segment-relation.csv',
    index_col=0,
)

In [14]:
# preview the first five rows
df.head(n=5)


Out[14]:
clause segment relation
0 S satellite cause
1 NP nucleus list
2 S satellite condition
3 S satellite condition
4 S satellite condition

In [22]:
# number of rows in the DataFrame (first entry of its shape)
nrows = df.shape[0]
# the first row, selected by position
row0 = df.iloc[0]

In [58]:
# Collect all rows in one list; each entry is the array of
# that row's column values.
tracts = [
    df.iloc[row_pos].values
    for row_pos in range(df.shape[0])
]

In [84]:
import fim

def dataframe2arules(dataframe, min_support=10, min_confidence=80):
    """Mine association rules from the rows of a DataFrame.

    Each row is handed to ``fim.arules`` as one transaction whose items
    are the row's cell values. Column names are not part of the items,
    so equal values in different columns count as the same item.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        table to mine; every cell value must be hashable
    min_support : int or float
        minimum support of a rule, passed on as ``supp`` (default: 10).
        PyFIM reads a positive value as a percentage of all rows and a
        negative value as an absolute number of rows. The default thus
        means "10% of the rows", not "10 rows"; pass e.g. -10 to
        require at least 10 matching rows.
    min_confidence : int or float
        minimum confidence of an assoc. rule in percent, passed on as
        ``conf`` (default: 80%)

    Returns
    -------
    list
        the result of ``fim.arules``. With PyFIM's default report
        format each rule is a tuple
        ``(consequent, antecedent_items, absolute_support, confidence_percent)``.
    """
    # itertuples walks the frame once instead of doing a positional
    # .iloc lookup per row; every transaction is a plain tuple of the
    # row's values (index excluded).
    tracts = dataframe.itertuples(index=False, name=None)
    return fim.arules(tracts, supp=min_support, conf=min_confidence)

In [76]:
# Support counts to look for in the resulting rules:
#   168 satellites
#   127 S-clause satellites
#    60 (S-clause) condition satellites
#    21 circumstance satellites
dataframe2arules(dataframe=df)


Out[76]:
[('satellite', (), 168, 84.84848484848484),
 ('satellite', ('S',), 127, 86.39455782312925),
 ('satellite', ('condition',), 60, 93.75),
 ('satellite', ('condition', 'S'), 60, 95.23809523809523),
 ('S', ('condition', 'satellite'), 60, 100.0),
 ('S', ('condition',), 63, 98.4375),
 ('satellite', ('circumstance',), 21, 95.45454545454545)]

In [63]:
# rules among the rows whose clause type is not 'S'
dataframe2arules(df.loc[df['clause'].ne('S')])


Out[63]:
[('satellite', (), 41, 80.3921568627451),
 ('satellite', ('VP',), 16, 100.0),
 ('satellite', ('purpose',), 12, 100.0),
 ('satellite', ('purpose', 'VP'), 10, 100.0),
 ('VP', ('purpose', 'satellite'), 10, 83.33333333333334),
 ('VP', ('purpose',), 10, 83.33333333333334),
 ('NP', ('nucleus',), 8, 80.0),
 ('satellite', ('PP',), 8, 88.88888888888889)]

In [77]:
# rules among the rows whose segment type is not 'satellite'
dataframe2arules(df.loc[df['segment'].ne('satellite')])
# 24 nuclei
# 8 NP-clause nuclei


Out[77]:
[('nucleus', (), 24, 80.0),
 ('nucleus', ('NP',), 8, 100.0),
 ('S', ('span',), 6, 100.0),
 ('nucleus', ('e-elaboration',), 4, 100.0),
 ('nucleus', ('e-elaboration', 'NP'), 3, 100.0),
 ('nucleus', ('sequence',), 4, 100.0),
 ('nucleus', ('sequence', 'S'), 3, 100.0),
 ('nucleus', ('list',), 4, 100.0),
 ('nucleus', ('elaboration',), 3, 100.0),
 ('nucleus', ('elaboration', 'S'), 3, 100.0),
 ('S', ('elaboration', 'nucleus'), 3, 100.0),
 ('S', ('elaboration',), 3, 100.0),
 ('nucleus', ('contrast',), 3, 100.0),
 ('nucleus', ('contrast', 'S'), 3, 100.0),
 ('S', ('contrast', 'nucleus'), 3, 100.0),
 ('S', ('contrast',), 3, 100.0)]

In [83]:
# rules among the rows whose relation is not 'condition'
dataframe2arules(df.loc[df['relation'].ne('condition')])


Out[83]:
[('satellite', (), 108, 80.59701492537313),
 ('satellite', ('circumstance',), 21, 95.45454545454545),
 ('satellite', ('circumstance', 'S'), 16, 94.11764705882352),
 ('satellite', ('cause',), 17, 89.47368421052632),
 ('satellite', ('cause', 'S'), 15, 93.75),
 ('S', ('cause', 'satellite'), 15, 88.23529411764706),
 ('S', ('cause',), 16, 84.21052631578947),
 ('satellite', ('purpose',), 16, 100.0),
 ('satellite', ('VP',), 16, 100.0)]

In [ ]: