In [20]:
from nltk import word_tokenize
from nltk.misc import wordfinder

In [21]:
from nltk.corpus import wordnet as wn

In [22]:
print wn.synsets('ORG')
print wn.synsets('CONTACT')
print wn.synsets('EMAIL')


[]
[Synset('contact.n.01'), Synset('contact.n.02'), Synset('contact.n.03'), Synset('contact.n.04'), Synset('contact.n.05'), Synset('liaison.n.02'), Synset('contact.n.07'), Synset('contact.n.08'), Synset('contact.n.09'), Synset('reach.v.04'), Synset('touch.v.05')]
[Synset('electronic_mail.n.01'), Synset('e-mail.v.01')]

In [23]:
word_tokenize('REFERENCENUMBER')


Out[23]:
['REFERENCENUMBER']

In [24]:
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
lem.lemmatize('reference')


Out[24]:
'reference'

In [54]:
def gen_pairs(string, stop=1):
    sz = len(string)
    if sz == 1:
        return
    half = sz / 2
    for i in range(half, sz-stop):
        yield string[:i], string[i:]
    for i in range(half, sz-stop):
        yield string[:-i], string[-i:]
    
print list( gen_pairs('contractemail') )
print list( gen_pairs('noticeid') )
print list( gen_pairs('org') )


[('contra', 'ctemail'), ('contrac', 'temail'), ('contract', 'email'), ('contracte', 'mail'), ('contractem', 'ail'), ('contractema', 'il'), ('contrac', 'temail'), ('contra', 'ctemail'), ('contr', 'actemail'), ('cont', 'ractemail'), ('con', 'tractemail'), ('co', 'ntractemail')]
[('noti', 'ceid'), ('notic', 'eid'), ('notice', 'id'), ('noti', 'ceid'), ('not', 'iceid'), ('no', 'ticeid')]
[('o', 'rg'), ('or', 'g')]

In [83]:
import re

def is_word(string):
    if wn.synsets(string):
        return True
    
def find_two_words(string):
    best = None
    best_score = 0
    for left, right in gen_pairs(string):
        score = 0
        if is_word(left):
            score += .5
        if is_word(right):
            score += .5
        if score > best_score:
            best = left, right
            best_score = score
    return best
    
def split_words(string):
    """
    Spits a string into multiple words.
    This assumes any string of letters is only at max two words.
    
    If there is a space in it, we assume it's already split and is all nice.
    """
    string = string.strip().lower()
    
    if ' ' in string:
        for obj in string.split():
            yield obj.lower()
    
    subs = re.split('([^a-z]+)', string.lower())
    for sub in subs:
        
        ## If it's not a word, well we don't care about it.
        if not re.match('[a-z]+', sub):
            continue
        
        ## If it is a word, yay!
        if is_word(sub):
            yield sub
        
        ## If it's not a word, but it's 3 or less letters it's probably not two words, so just yield that.
        elif len(sub) <= 3:
            yield sub
        
        ## Otherwise let's try to split it into two words
        else:
            pair = find_two_words(sub)
            if pair:
                yield pair[0]
                yield pair[1]
            
            ## No dice, okay well let's just yield it
            else:
                yield sub
    
    
def cut(string, half):
    sz = len(string)
    if sz == 1:
        return False
    half = sz / 2
    left = string[:half]
    right = string[half:]
    return left, right
    

#list( split_words("noticeid") )
list( split_words("contract_sequence_number") )


Out[83]:
['contract', 'sequence', 'number']

In [75]:
headers = [
    '=== UK ===',
    'NOTICEID',
    'REFERENCENUMBER',
    'DATEPUBLISHED',
    'VALUEMIN',
    'VALUEMAX',
    'STATUS',
    'URL',
    'ORG_NAME',
    'ORG_CONTACTEMAIL',
    'TITLE',
    'DESCRIPTION',
    'NOTICETYPE',
    'REGION',
    'NOTICE_STATE',
    'NOTICE_STATE_CHANGE_DATE',
    'CLASSIFICATION',
    'NUM_DOCS',
    
    '=== Canada ===',
    'language',
    'procurement_entity_name',
    'title',
    'reference_number',
    'solicitation_number',
    'contract_sequence_number',
    'contract_number',
    'publishing_status',
    'award_date',
    'publication_date',
    'amendment_date',
    'gsin',
    'contract_award_procedure',
    'tendering_procedure',
    'procurement_entity',
    'end_user_entity',
    'customer_info',
    'description',
    'supplier_info',
    'currency',
    'contract_value',
    
    '=== Mexico ===',
    'GOVERNMENT',
    'ACRONYMS',
    'UNIT',
    'CompraNet Unit Identifier',
    'CompraNet Unit name',
    'Responsibility',
    'CompraNet Record ID',
    'Record Title ',
    'Record template',
    'Procedure Number',
    'Decision Date',
    'RFP Date Type',
    'Proposal Opening Date',
    'Procedure scope',
    'Contract Type',
    'Procedure type',
    'Submission form type',
    'Contract ID',
    'Contract Title',
    'Contract start date',
    'contract end date',
    'contract value',
    'Currency type',
    'contract status',
    'archive status',
    'Unit Branch',
    'Program Key',
    'Federal contribution',
    'Signature date',
    'Contract Framework',
    'Consolidated purchase',
    'Multi-year',
    'Unit stratification',
    'Contractor Legal Name',
    'Contractor stratification',
    'Company state',
    'Account manager type',
    'announcement URL',
    
    '=== Columbia ===',
    'Sector Code',
    'Name of the Sector',
    'Executing Unit Code',
    'Name of the unit',
    'Budget Type',
    'Budget Category',
    'Name of the budget category',
    'Funding Source',
    'Name of the Budget',
    'Budget Status',
    'Month',
    'Initial Appropriation',
    'Current Appropiation',
    'Value of the Engagement',
    'Value of the obligations',
    'Total Value of the Payment order',
    'Engaged Percentage',
    'Mandatory percentage',
    'Payed percentage',
    
    '=== Georgia ===',
    'id',
    'procurring_entity_id',
    'tender_type',
    'tender_registration_number',
    'tender_status',
    'tender_announcement_date',
    'bid_start_date',
    'bid_end_date',
    'estimated_value',
    'include_vat',
    'cpv_code',
    'offer_step',
    'guarantee_amount',
    'guarantee_period',
    'created_at',
    'updated_at',
    'dataset_id',
    'url_id',
    'num_bids',
    'num_bidders',
    'contract_value',
    'winning_org_id',
    'risk_indicators',
    'procurer_name',
    'supplier_name',
    'sub_codes',
    'inProgress',
    'updated',
    'is_new',
    'procurer_code',
    'winner_code',
]

for header in headers:
    if header.startswith('='):
        print "\n", header
        continue
    print header, list( split_words(header) )


=== UK ===
NOTICEID ['notice', 'id']
REFERENCENUMBER ['reference', 'number']
DATEPUBLISHED ['date', 'published']
VALUEMIN ['value', 'min']
VALUEMAX ['value', 'max']
STATUS ['status']
URL ['url']
ORG_NAME ['org', 'name']
ORG_CONTACTEMAIL ['org', 'contact', 'email']
TITLE ['title']
DESCRIPTION ['description']
NOTICETYPE ['notice', 'type']
REGION ['region']
NOTICE_STATE ['notice', 'state']
NOTICE_STATE_CHANGE_DATE ['notice', 'state', 'change', 'date']
CLASSIFICATION ['classification']
NUM_DOCS ['num', 'docs']

=== Canada ===
language ['language']
procurement_entity_name ['procurement', 'entity', 'name']
title ['title']
reference_number ['reference', 'number']
solicitation_number ['solicitation', 'number']
contract_sequence_number ['contract', 'sequence', 'number']
contract_number ['contract', 'number']
publishing_status ['publishing', 'status']
award_date ['award', 'date']
publication_date ['publication', 'date']
amendment_date ['amendment', 'date']
gsin ['gs', 'in']
contract_award_procedure ['contract', 'award', 'procedure']
tendering_procedure ['tendering', 'procedure']
procurement_entity ['procurement', 'entity']
end_user_entity ['end', 'user', 'entity']
customer_info ['customer', 'info']
description ['description']
supplier_info ['supplier', 'info']
currency ['currency']
contract_value ['contract', 'value']

=== Mexico ===
GOVERNMENT ['government']
ACRONYMS ['acronyms']
UNIT ['unit']
CompraNet Unit Identifier ['compranet', 'unit', 'identifier', 'comp', 'ranet', 'unit', 'identifier']
CompraNet Unit name ['compranet', 'unit', 'name', 'comp', 'ranet', 'unit', 'name']
Responsibility ['responsibility']
CompraNet Record ID ['compranet', 'record', 'id', 'comp', 'ranet', 'record', 'id']
Record Title  ['record', 'title', 'record', 'title']
Record template ['record', 'template', 'record', 'template']
Procedure Number ['procedure', 'number', 'procedure', 'number']
Decision Date ['decision', 'date', 'decision', 'date']
RFP Date Type ['rfp', 'date', 'type', 'rfp', 'date', 'type']
Proposal Opening Date ['proposal', 'opening', 'date', 'proposal', 'opening', 'date']
Procedure scope ['procedure', 'scope', 'procedure', 'scope']
Contract Type ['contract', 'type', 'contract', 'type']
Procedure type ['procedure', 'type', 'procedure', 'type']
Submission form type ['submission', 'form', 'type', 'submission', 'form', 'type']
Contract ID ['contract', 'id', 'contract', 'id']
Contract Title ['contract', 'title', 'contract', 'title']
Contract start date ['contract', 'start', 'date', 'contract', 'start', 'date']
contract end date ['contract', 'end', 'date', 'contract', 'end', 'date']
contract value ['contract', 'value', 'contract', 'value']
Currency type ['currency', 'type', 'currency', 'type']
contract status ['contract', 'status', 'contract', 'status']
archive status ['archive', 'status', 'archive', 'status']
Unit Branch ['unit', 'branch', 'unit', 'branch']
Program Key ['program', 'key', 'program', 'key']
Federal contribution ['federal', 'contribution', 'federal', 'contribution']
Signature date ['signature', 'date', 'signature', 'date']
Contract Framework ['contract', 'framework', 'contract', 'framework']
Consolidated purchase ['consolidated', 'purchase', 'consolidated', 'purchase']
Multi-year ['mu', 'lti', 'year']
Unit stratification ['unit', 'stratification', 'unit', 'stratification']
Contractor Legal Name ['contractor', 'legal', 'name', 'contractor', 'legal', 'name']
Contractor stratification ['contractor', 'stratification', 'contractor', 'stratification']
Company state ['company', 'state', 'company', 'state']
Account manager type ['account', 'manager', 'type', 'account', 'manager', 'type']
announcement URL ['announcement', 'url', 'announcement', 'url']

=== Columbia ===
Sector Code ['sector', 'code', 'sector', 'code']
Name of the Sector ['name', 'of', 'the', 'sector', 'name', 'of', 'the', 'sector']
Executing Unit Code ['executing', 'unit', 'code', 'executing', 'unit', 'code']
Name of the unit ['name', 'of', 'the', 'unit', 'name', 'of', 'the', 'unit']
Budget Type ['budget', 'type', 'budget', 'type']
Budget Category ['budget', 'category', 'budget', 'category']
Name of the budget category ['name', 'of', 'the', 'budget', 'category', 'name', 'of', 'the', 'budget', 'category']
Funding Source ['funding', 'source', 'funding', 'source']
Name of the Budget ['name', 'of', 'the', 'budget', 'name', 'of', 'the', 'budget']
Budget Status ['budget', 'status', 'budget', 'status']
Month ['month']
Initial Appropriation ['initial', 'appropriation', 'initial', 'appropriation']
Current Appropiation ['current', 'appropiation', 'current', 'appropiat', 'ion']
Value of the Engagement ['value', 'of', 'the', 'engagement', 'value', 'of', 'the', 'engagement']
Value of the obligations ['value', 'of', 'the', 'obligations', 'value', 'of', 'the', 'obligations']
Total Value of the Payment order ['total', 'value', 'of', 'the', 'payment', 'order', 'total', 'value', 'of', 'the', 'payment', 'order']
Engaged Percentage ['engaged', 'percentage', 'engaged', 'percentage']
Mandatory percentage ['mandatory', 'percentage', 'mandatory', 'percentage']
Payed percentage ['payed', 'percentage', 'payed', 'percentage']

=== Georgia ===
id ['id']
procurring_entity_id ['procur', 'ring', 'entity', 'id']
tender_type ['tender', 'type']
tender_registration_number ['tender', 'registration', 'number']
tender_status ['tender', 'status']
tender_announcement_date ['tender', 'announcement', 'date']
bid_start_date ['bid', 'start', 'date']
bid_end_date ['bid', 'end', 'date']
estimated_value ['estimated', 'value']
include_vat ['include', 'vat']
cpv_code ['cpv', 'code']
offer_step ['offer', 'step']
guarantee_amount ['guarantee', 'amount']
guarantee_period ['guarantee', 'period']
created_at ['created', 'at']
updated_at ['updated', 'at']
dataset_id ['data', 'set', 'id']
url_id ['url', 'id']
num_bids ['num', 'bids']
num_bidders ['num', 'bidders']
contract_value ['contract', 'value']
winning_org_id ['winning', 'org', 'id']
risk_indicators ['risk', 'indicators']
procurer_name ['procurer', 'name']
supplier_name ['supplier', 'name']
sub_codes ['sub', 'codes']
inProgress ['in', 'progress']
updated ['updated']
is_new ['is', 'new']
procurer_code ['procurer', 'code']
winner_code ['winner', 'code']

In [65]:


In [ ]: