>>> import nltk
>>> from urllib.request import urlopen
>>> url = "http://www.google.com"
>>> raw = urlopen(url).read().decode('utf8')
>>> tokens = nltk.word_tokenize(raw)
>>> text = nltk.Text(tokens)
>>> text.collocations()
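The page fetched above is raw HTML, so the tokens will be littered with markup. A minimal sketch of stripping the tags first, assuming the third-party BeautifulSoup (bs4) package is installed:

>>> from bs4 import BeautifulSoup
>>> html = urlopen(url).read().decode('utf8')
>>> raw = BeautifulSoup(html, 'html.parser').get_text()   # keep only the visible text
>>> tokens = nltk.word_tokenize(raw)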
>>> f = open('document.txt')
>>> raw = f.read()
>>> import os
>>> os.listdir('.')
>>> f = open('document.txt')
>>> for line in f:
...     print(line.strip())
String Methods:
Method | Functionality |
---|---|
s.find(t) | index of first instance of string t inside s (-1 if not found) |
s.rfind(t) | index of last instance of string t inside s (-1 if not found) |
s.index(t) | like s.find(t) except it raises ValueError if not found |
s.rindex(t) | like s.rfind(t) except it raises ValueError if not found |
s.join(text) | combine the words of the text into a string using s as the glue |
s.split(t) | split s into a list wherever a t is found (whitespace by default) |
s.splitlines() | split s into a list of strings, one per line |
s.lower() | a lowercased version of the string s |
s.upper() | an uppercased version of the string s |
s.title() | a titlecased version of the string s |
s.strip() | a copy of s without leading or trailing whitespace |
s.replace(t, u) | replace instances of t with u inside s |
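A quick run through a few of these methods (the example string here is made up for illustration):

>>> s = '  natural language processing  '.strip()
>>> s.find('language')
8
>>> '-'.join(s.split())
'natural-language-processing'
>>> s.replace('processing', 'toolkit')
'natural language toolkit'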
>>> ord('a')
97
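ord() maps a character to its integer code point, and chr() is its inverse:

>>> chr(97)
'a'
>>> [chr(i) for i in range(97, 102)]
['a', 'b', 'c', 'd', 'e']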
Regular Expression
>>> import re
>>> wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
>>> ed = [w for w in wordlist if re.search('ed$', w)]    # words ending with 'ed'
>>> [w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]
['gold', 'golf', 'hold', 'hole']
>>> wsj = sorted(set(nltk.corpus.treebank.words()))
>>> a = [w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)]    # decimals of the form n.n
>>> [w for w in wsj if re.search('^[A-Z]+\$$', w)]
['C$', 'US$']
>>> b = [w for w in wsj if re.search('^[0-9]{4}$', w)]    # exactly four digits, e.g. 1987
>>> [w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]
['black-and-white', 'bread-and-butter', 'father-in-law', 'machine-gun-toting', 'savings-and-loan']
>>> [w for w in wsj if re.search('(ed|ing)$', w)]
Operator | Behavior |
---|---|
. | Wildcard, matches any character |
^abc | Matches some pattern abc at the start of a string |
abc$ | Matches some pattern abc at the end of a string |
[abc] | Matches one of a set of characters |
[A-Z0-9] | Matches one of a range of characters |
ed|ing|s | Matches one of the specified strings (disjunction) |
* | Zero or more of previous item, e.g. a*, [a-z]* (also known as Kleene Closure) |
+ | One or more of previous item, e.g. a+, [a-z]+ |
? | Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]? |
{n} | Exactly n repeats where n is a non-negative integer |
{n,} | At least n repeats |
{,n} | No more than n repeats |
{m,n} | At least m and no more than n repeats |
a(b|c)+ | Parentheses that indicate the scope of the operators |
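A few of these operators in combination (the test words below are made up for illustration):

>>> [w for w in ['cat', 'cart', 'caaat'] if re.search('^ca+t$', w)]
['cat', 'caaat']
>>> [w for w in ['walks', 'walked', 'walking'] if re.search('(ed|ing)$', w)]
['walked', 'walking']
>>> [w for w in ['Word', 'word', 'WORD'] if re.search('^[A-Z][a-z]*$', w)]
['Word']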
>>> word = 'supercalifragilisticexpialidocious'
>>> re.findall(r'[aeiou]', word)
['u', 'e', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']
>>> def stem(word):
...     for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
...         if word.endswith(suffix):
...             return word[:-len(suffix)]
...     return word
>>> re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')
['ing']
>>> re.findall('^.*ing$', 'doing')
['doing']
>>> re.findall('^.*(ing)$', 'doing')
['ing']
>>> re.findall('^.*(?:ing)$', 'doing')
['doing']
>>> re.findall('^(.*)ing$', 'doing')
['do']
>>> re.findall('^(.*)(ing)$', 'doing')
[('do', 'ing')]
>>> def stem(word):
...     regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
...     stem, suffix = re.findall(regexp, word)[0]
...     return stem
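Applying this regexp-based stem() to a few words (the sample words are illustrative; the outputs follow from the pattern above):

>>> stem('processing')
'process'
>>> stem('processes')
'process'
>>> stem('language')
'language'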
>>> moby = nltk.Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))
>>> moby.findall(r"<a> (<.*>) <man>")
monied; nervous; dangerous; white; white; white; pious; queer; good; mature;
white; Cape; great; wise; wise; butterless; white; fiendish; pale; furious;
better; certain; complete; dismasted; younger; brave; brave; brave; brave
Symbol | Function |
---|---|
\b | Word boundary (zero width) |
\d | Any decimal digit (equivalent to [0-9]) |
\D | Any non-digit character (equivalent to [^0-9]) |
\s | Any whitespace character (equivalent to [ \t\n\r\f\v]) |
\S | Any non-whitespace character (equivalent to [^ \t\n\r\f\v]) |
\w | Any alphanumeric character (equivalent to [a-zA-Z0-9_]) |
\W | Any non-alphanumeric character (equivalent to [^a-zA-Z0-9_]) |
\t | The tab character |
\n | The newline character |
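A short illustration of these escape sequences (the sample strings are made up):

>>> re.findall(r'\d+', 'In 1987 the fee was 3.14 dollars')
['1987', '3', '14']
>>> re.findall(r'\w+', 'non-alphanumeric characters split words')
['non', 'alphanumeric', 'characters', 'split', 'words']
>>> re.findall(r'\bcat\b', 'the cat in the catalogue')
['cat']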
>>> porter = nltk.PorterStemmer()
>>> [porter.stem(t) for t in tokens]        # strip affixes heuristically (Porter algorithm)
>>> wnl = nltk.WordNetLemmatizer()
>>> [wnl.lemmatize(t) for t in tokens]      # map tokens to WordNet headwords (lemmas)
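The difference shows up on individual words: the stemmer chops suffixes heuristically, while the lemmatizer only removes affixes when the result is a word in WordNet. A couple of single-word checks with NLTK's defaults:

>>> porter.stem('lying')
'lie'
>>> wnl.lemmatize('lying')
'lying'
>>> wnl.lemmatize('women')
'woman'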