In [1]:
# Sample sentence used by the following cells. Adjacent string literals are
# concatenated by Python, so the value is one single-line string.
text1 = (
    "Julia is a high-level high-performance dynamic programming language "
    "for numerical computing and Julia is used ..."
)
len(text1)  # number of characters in the sentence
Out[1]:
In [2]:
# Tokenise the sentence: every single space acts as a separator.
text2 = text1.split(" ")
text2  # display the resulting list of tokens
Out[2]:
In [3]:
len(text2)  # number of space-separated tokens in text2
Out[3]:
In [4]:
# Keep only the tokens of text2 that are more than 3 characters long.
[word for word in text2 if len(word) > 3]
Out[4]:
In [5]:
# Capitalized tokens of text2, i.e. those for which str.istitle() is true.
[word for word in text2 if word.istitle()]
Out[5]:
In [6]:
# Tokens of text2 whose last character is a lowercase 'l'.
[word for word in text2 if word.endswith('l')]
Out[6]:
We can find unique words using set().
In [7]:
len(set(text2))  # number of distinct tokens; the comparison is case-sensitive
Out[7]:
In [8]:
set(text2)  # the distinct tokens themselves; a set is unordered, so display order may vary
Out[8]:
In [9]:
# Distinct tokens after lower-casing, so differently-cased spellings of the
# same word collapse into a single entry.
{word.lower() for word in text2}
Out[9]:
In [10]:
# Tweet-like sample containing '@' callouts, a lone '@' and a '#' hashtag.
# Adjacent string literals are concatenated into one single-line string.
text3 = (
    'Demystifying Dynamic Programming '
    '@freecamp @bostongroup @ NY #algorithms'
)
text4 = text3.split(' ')  # space-separated tokens
text4
Out[10]:
Finding hashtags:
In [11]:
# Hashtags: tokens of text4 that begin with '#'.
[token for token in text4 if token.startswith('#')]
Out[11]:
Finding callouts:
In [12]:
# Callouts: tokens of text4 that begin with '@' (a bare '@' token qualifies too).
[token for token in text4 if token.startswith('@')]
Out[12]:
Problem
This also finds a lone '@' that is not followed by a name.
Solution
Regular expressions for more complex parsing.
For example, '@[A-Za-z0-9_]+' will return all words that:
- start with '@'
- and are followed by at least one:
  - capital letter ('A-Z')
  - lowercase letter ('a-z')
  - number ('0-9')
  - or underscore ('_')
In [14]:
import re  # regular expressions

# Callouts: tokens that begin with '@' followed by at least one letter, digit
# or underscore, so a bare '@' is rejected.
# re.match only matches at the start of the token; re.search would also accept
# an '@name' buried inside a token (e.g. 'user@example'), which is not a callout.
# The pattern is a raw string so backslashes, if added later, reach the regex
# engine untouched.
[w for w in text4 if re.match(r'@[A-Za-z0-9_]+', w)]
Out[14]: