In [1]:
import sys, os
import nltk
import pandas as pd
In [2]:
from collections import defaultdict
from gensim import corpora, models, similarities
def make_dictionary(documents):
    """
    Construct a dictionary, i.e. a mapping between word ids and their frequency of occurrence
    in the whole corpus, then filter it to remove stopwords and words occurring < min_count times.
    input:  documents, an iterable of token lists (one list of words per document)
    output: the filtered dictionary
    """
    dictionary = corpora.Dictionary(documents)
    stop_words = nltk.corpus.stopwords.words('english')
    min_count = 2
    stop_ids = [dictionary.token2id[word] for word in stop_words
                if word in dictionary.token2id]
    rare_ids = [id for id, freq in dictionary.dfs.items()
                if freq < min_count]
    dictionary.filter_tokens(stop_ids + rare_ids)
    dictionary.compactify()
    return dictionary
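# Illustrative check (an addition, not part of the original pipeline): on a toy corpus the
# stopword 'the' is removed, as are all words appearing fewer than min_count = 2 times,
# so only 'sat' survives in the filtered dictionary. toy_docs is a made-up example.
toy_docs = [['the', 'cat', 'sat'], ['the', 'dog', 'sat'], ['parrot']]
print(make_dictionary(toy_docs).token2id)   # expected: {'sat': 0}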
def make_corpus(documents):
    """
    Build the filtered dictionary and convert each document to its bag-of-words vector.
    """
    dictionary = make_dictionary(documents)
    # convert corpus to vectors using the bag-of-words representation,
    # i.e. tuples of (word id, word count)
    corpus = [dictionary.doc2bow(words) for words in documents]
    return corpus, dictionary
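# Illustrative only (added): doc2bow maps a token list to sparse (word id, word count) pairs.
toy_dict = corpora.Dictionary([['autism', 'therapy'], ['autism', 'diet']])
print(toy_dict.doc2bow(['autism', 'autism', 'diet']))   # e.g. [(0, 2), (2, 1)]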
def make_lsi_similarity_matrix(tfidf_corpus, dictionary):
    """
    Construct an LSI (latent semantic indexing) model on the tf-idf-transformed corpus,
    save it to disk and return its document similarity matrix.
    """
    # construct model
    lsi = models.lsimodel.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
    lsi.save('lsi-model.save')
    # create similarity matrix
    matsim = similarities.MatrixSimilarity(lsi[tfidf_corpus], num_best=1000)
    return matsim
def make_lda_similarity_matrix(corpus, dictionary):
    """
    Construct an LDA (latent Dirichlet allocation) model on the bag-of-words corpus,
    save it to disk and return its document similarity matrix.
    """
    # construct model
    lda = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=300)
    lda.save('lda-model.save')
    # create similarity matrix
    matsim = similarities.MatrixSimilarity(lda[corpus], num_best=1000)
    return matsim
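# Note (added comment): both functions return a gensim similarity index. Querying it with a
# single projected document vector yields up to num_best=1000 (document index, cosine
# similarity) pairs, sorted by decreasing similarity.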
In [3]:
# Move to the data directory and name the output file for the combined data set
os.chdir('../data')
output_fname = "articles-n-forums-posts.csv"
# Read the articles from file
input_fname = "AutismParentMagazine-posts-clean.csv"
df = pd.read_csv(input_fname, index_col=0)
df.index.name = 'post id'
df.head(2)
In [4]:
input_fname = "MedHelp-posts.csv"
df2 = pd.read_csv(input_fname, index_col=0)
df2['source'] = 'http://www.medhelp.org'
df2['category'] = 'forums'
# The user id is not needed here
del df2['user id']
# Remove the question posts from the forums (rows whose index equals their 'mother post id'),
# keeping only the answers
df3 = df2.drop(df2.loc[df2.index == df2['mother post id']].index)
del df3['mother post id']
df2 = df3
del df3
In [5]:
df2.head(5)
Out[5]:
In [6]:
# Join the two data sets
# (DataFrame.append is deprecated in recent pandas, so use pd.concat instead)
df = pd.concat([df, df2], ignore_index=True)
In [7]:
# Merge pairs of very similar categories into one.
category_merges = {
    'category-autism-articles': 'category-general',
    'category-applied-behavior-analysis-aba': 'category-autism-therapy',
    'category-autism-and-food': 'category-autism-and-diet',
}
df['category'] = df['category'].replace(category_merges)
In [8]:
# Build a list of categories for each title, in case the same title appears under several categories.
# Find the list of unique titles
unique_titles = df['title'].unique()
# Find the list of categories for each title:
dic_category = {}
for title in unique_titles:
    cat_list = df.loc[df['title'] == title]['category'].values
    # Remove repeated elements by converting to a set and back to a list
    cat_set = set(cat_list)
    cat_list = list(cat_set)
    dic_category[title] = cat_list
In [9]:
# Drop rows with duplicate titles and reset the index
df = df.drop_duplicates('title')
df = df.reset_index(drop=True)
In [10]:
# Write each title's list of categories (as a string) back into the 'category' column
for ii in df.index:
    title = df.loc[ii, ['title']].values[0]
    category = dic_category[title]
    df.loc[ii, ['category']] = str(category)
In [11]:
df.head(2)
Out[11]:
In [12]:
# Tokenize data
import nltk
tokenizer = nltk.RegexpTokenizer(r'\w+')
# Get list of tokens from text in first article:
text = df['text'][0].lower()
# Include the title as well:
title = df['title'][0].lower()
ttext = tokenizer.tokenize(title+" "+text)
print( text )
print( ttext )
In [13]:
# Build a column with the list of tokens:
# 1) join title and text into one string
# 2) convert to lower case
# 3) tokenize and save the result in a new column (tokens)
#df['tokens'] = df['text'].map(lambda x: tokenizer.tokenize(x.lower()))
df2 = df[['title', 'text']].apply(lambda x: ','.join(x.astype(str)), axis=1)
df['tokens'] = df2.map(lambda x: tokenizer.tokenize(x.lower()))
del df2
In [14]:
# Short version of the text (first 300 characters)
df['text_short'] = df['text'].apply(lambda x: x[:300])
In [15]:
print(df.loc[0,'text'])
print(df.loc[0,'text_short'])
In [16]:
df['tokens'].head(5)
# Save the dataframe with tokens to file
df.to_csv(output_fname)
In [17]:
# Build the corpus and dictionary, then the similarity matrices
documents = df['tokens'].values
print(documents[:3])
corpus, dictionary = make_corpus(documents)
# Save the dictionary and corpus to file
import pickle
pickle.dump(dictionary, open("dictionary.save", "wb"))
pickle.dump(corpus, open("corpus.save", "wb"))
# Build and save the tf-idf model
tfidf = models.TfidfModel(corpus)
tfidf.save('tfidf.save')
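# For illustration (added): the tf-idf model turns each bag-of-words vector into
# (word id, weight) pairs; inspect the first few weights of the first document.
print(tfidf[corpus[0]][:5])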
lsi_matsim = make_lsi_similarity_matrix(tfidf[corpus], dictionary)
lda_matsim = make_lda_similarity_matrix(corpus, dictionary)
# The models are saved into files in the above routines
# Save similarity matrices too:
pickle.dump(lsi_matsim,open("lsi-matsim.save","wb"))
pickle.dump(lda_matsim,open("lda-matsim.save","wb"))
In [ ]:
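# Minimal usage sketch (added, not part of the original pipeline): reload the artifacts
# saved above and query the LSI index with a new piece of text. It assumes the earlier
# cells have been run (so `tokenizer` and `df` exist); the query string and variable
# names below are illustrative only.
import pickle
from gensim import models

dictionary = pickle.load(open("dictionary.save", "rb"))
tfidf = models.TfidfModel.load('tfidf.save')
lsi = models.LsiModel.load('lsi-model.save')
lsi_matsim = pickle.load(open("lsi-matsim.save", "rb"))

query = "gluten free diet and autism"
query_bow = dictionary.doc2bow(tokenizer.tokenize(query.lower()))
sims = lsi_matsim[lsi[tfidf[query_bow]]]   # up to num_best (doc index, similarity) pairs
for doc_id, score in sims[:5]:
    print(f"{score:.3f}  {df.loc[doc_id, 'title']}")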