#### PATTERN | TEXT | PARSER #######################################################################
# -*- coding: utf-8 -*-
# Copyright (c) 2010 University of Antwerp, Belgium
# Author: Tom De Smedt <tom@organisms.be>
# License: BSD (see LICENSE.txt for details).
# http://www.clips.ua.ac.be/pages/pattern
# Source: https://github.com/clips/pattern/blob/master/pattern/text/__init__.py
# git-commit: 2014-05-10 (2f944cb)
# Modified: 2015-05-02 Markus Killer <m.killer@langui.ch>
####################################################################################################
from __future__ import unicode_literals, absolute_import
import os
import sys
import re
import string
import types
import codecs
from xml.etree import cElementTree
from itertools import chain
from math import log
try:
MODULE = os.path.dirname(os.path.realpath(__file__))
except NameError:
    # __file__ is undefined in some environments (e.g., the interactive interpreter).
    MODULE = ""
from _pattern.compat import text_type, string_types, basestring, imap, unicode, binary_type, PY2
from _pattern.text.tree import Tree, Text, Sentence, Slice, Chunk, PNPChunk, Chink, Word, table
from _pattern.text.tree import SLASH, WORD, POS, CHUNK, PNP, REL, ANCHOR, LEMMA, AND, OR
#--- STRING FUNCTIONS ------------------------------------------------------------------------------
# Latin-1 (ISO-8859-1) encoding is identical to Windows-1252 except for code points 128-159:
# Latin-1 assigns control codes to this range, while Windows-1252 assigns printable
# characters (letters, punctuation marks, symbols).
# String functions
def decode_string(v, encoding="utf-8"):
""" Returns the given value as a Unicode string (if possible).
"""
if isinstance(encoding, basestring):
encoding = ((encoding,),) + (("windows-1252",), ("utf-8", "ignore"))
if isinstance(v, binary_type):
for e in encoding:
try:
return v.decode(*e)
            except (UnicodeDecodeError, LookupError):
                pass
return v
return unicode(v)
def encode_string(v, encoding="utf-8"):
""" Returns the given value as a Python byte string (if possible).
"""
if isinstance(encoding, basestring):
encoding = ((encoding,),) + (("windows-1252",), ("utf-8", "ignore"))
if isinstance(v, unicode):
for e in encoding:
try:
return v.encode(*e)
            except (UnicodeEncodeError, LookupError):
                pass
return v
return str(v)
decode_utf8 = decode_string
encode_utf8 = encode_string
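# Example (illustrative): decoding falls back through the encoding chain built above.
#   decode_string(b"caf\xe9")  # => u"café" (utf-8 fails, windows-1252 succeeds).
#   encode_string(u"café")     # => b"caf\xc3\xa9" (utf-8).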
PUNCTUATION = ".,;:!?()[]{}`'\"@#$^&*+-|=~_"
def ngrams(string, n=3, punctuation=PUNCTUATION, continuous=False):
""" Returns a list of n-grams (tuples of n successive words) from the given string.
Alternatively, you can supply a Text or Sentence object.
With continuous=False, n-grams will not run over sentence markers (i.e., .!?).
Punctuation marks are stripped from words.
"""
def strip_punctuation(s, punctuation=set(punctuation)):
return [w for w in s if (isinstance(w, Word) and w.string or w) not in punctuation]
if n <= 0:
return []
    if isinstance(string, basestring):
        s = [strip_punctuation(s.split(" ")) for s in tokenize(string)]
    elif isinstance(string, Sentence):
        s = [strip_punctuation(string)]
    elif isinstance(string, Text):
        s = [strip_punctuation(s) for s in string]
    else:
        raise TypeError("ngrams() expects a string, Sentence or Text")
if continuous:
s = [sum(s, [])]
g = []
for s in s:
#s = [None] + s + [None]
g.extend([tuple(s[i:i+n]) for i in range(len(s)-n+1)])
return g
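# Example (illustrative; assumes the tokenize() helper used above is available):
#   ngrams("The cat sat on the mat.", n=2)
#   # => [("The", "cat"), ("cat", "sat"), ("sat", "on"), ("on", "the"), ("the", "mat")]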
def deflood(s, n=3):
""" Returns the string with no more than n repeated characters, e.g.,
deflood("NIIIICE!!", n=1) => "Nice!"
deflood("nice.....", n=3) => "nice..."
"""
if n == 0:
return s[0:0]
return re.sub(r"((.)\2{%s,})" % (n-1), lambda m: m.group(1)[0] * n, s)
def pprint(string, token=[WORD, POS, CHUNK, PNP], column=4):
""" Pretty-prints the output of Parser.parse() as a table with outlined columns.
Alternatively, you can supply a tree.Text or tree.Sentence object.
"""
if isinstance(string, basestring):
print("\n\n".join([table(sentence, fill=column) for sentence in Text(string, token)]))
if isinstance(string, Text):
print("\n\n".join([table(sentence, fill=column) for sentence in string]))
if isinstance(string, Sentence):
print(table(string, fill=column))
#--- LAZY DICTIONARY -------------------------------------------------------------------------------
# A lazy dictionary is empty until one of its methods is called.
# This way many instances (e.g., lexicons) can be created without using memory until used.
class lazydict(dict):
def load(self):
# Must be overridden in a subclass.
# Must load data with dict.__setitem__(self, k, v) instead of lazydict[k] = v.
pass
def _lazy(self, method, *args):
""" If the dictionary is empty, calls lazydict.load().
Replaces lazydict.method() with dict.method() and calls it.
"""
if dict.__len__(self) == 0:
self.load()
setattr(self, method, types.MethodType(getattr(dict, method), self))
return getattr(dict, method)(self, *args)
def __repr__(self):
return self._lazy("__repr__")
def __len__(self):
return self._lazy("__len__")
def __iter__(self):
return self._lazy("__iter__")
def __contains__(self, *args):
return self._lazy("__contains__", *args)
def __getitem__(self, *args):
return self._lazy("__getitem__", *args)
def __setitem__(self, *args):
return self._lazy("__setitem__", *args)
def setdefault(self, *args):
return self._lazy("setdefault", *args)
    def get(self, *args):
        return self._lazy("get", *args)
def items(self):
return self._lazy("items")
def keys(self):
return self._lazy("keys")
def values(self):
return self._lazy("values")
def update(self, *args):
return self._lazy("update", *args)
def pop(self, *args):
return self._lazy("pop", *args)
def popitem(self, *args):
return self._lazy("popitem", *args)
class lazylist(list):
def load(self):
# Must be overridden in a subclass.
# Must load data with list.append(self, v) instead of lazylist.append(v).
pass
def _lazy(self, method, *args):
""" If the list is empty, calls lazylist.load().
Replaces lazylist.method() with list.method() and calls it.
"""
if list.__len__(self) == 0:
self.load()
setattr(self, method, types.MethodType(getattr(list, method), self))
return getattr(list, method)(self, *args)
def __repr__(self):
return self._lazy("__repr__")
def __len__(self):
return self._lazy("__len__")
def __iter__(self):
return self._lazy("__iter__")
def __contains__(self, *args):
return self._lazy("__contains__", *args)
def insert(self, *args):
return self._lazy("insert", *args)
def append(self, *args):
return self._lazy("append", *args)
def extend(self, *args):
return self._lazy("extend", *args)
def remove(self, *args):
return self._lazy("remove", *args)
def pop(self, *args):
return self._lazy("pop", *args)
#### PARSER ########################################################################################
# Pattern's text parsers are based on Brill's algorithm, or optionally on a trained language model.
# Brill's algorithm automatically acquires a lexicon of known words (aka tag dictionary),
# and a set of rules for tagging unknown words from a training corpus.
# Morphological rules are used to tag unknown words based on word suffixes (e.g., -ly = adverb).
# Contextual rules are used to tag unknown words based on a word's role in the sentence.
# Named entity rules are used to annotate proper nouns (NNPs: Google = NNP-ORG).
# When available, the parser will use a faster and more accurate language model (SLP, SVM, NB, ...).
#--- LEXICON ---------------------------------------------------------------------------------------
def _read(path, encoding="utf-8", comment=";;;"):
""" Returns an iterator over the lines in the file at the given path,
        stripping comments and decoding each line to Unicode.
"""
if path:
if isinstance(path, basestring) and os.path.exists(path):
# From file path.
if PY2:
f = codecs.open(path, 'r', encoding='utf-8')
else:
f = open(path, 'r', encoding='utf-8')
elif isinstance(path, basestring):
# From string.
f = path.splitlines()
else:
# From file or buffer.
f = path
for i, line in enumerate(f):
line = line.strip(codecs.BOM_UTF8) if i == 0 and isinstance(line, binary_type) else line
line = line.strip()
line = decode_utf8(line, encoding)
if not line or (comment and line.startswith(comment)):
continue
yield line
return
class Lexicon(lazydict):
def __init__(self, path=""):
""" A dictionary of known words and their part-of-speech tags.
"""
self._path = path
@property
def path(self):
return self._path
def load(self):
# Arnold NNP x
dict.update(self, (x.split(" ")[:2] for x in _read(self._path)))
#--- FREQUENCY -------------------------------------------------------------------------------------
class Frequency(lazydict):
def __init__(self, path=""):
""" A dictionary of words and their relative document frequency.
"""
self._path = path
@property
def path(self):
return self._path
def load(self):
# and 0.4805
for x in _read(self.path):
x = x.split()
dict.__setitem__(self, x[0], float(x[1]))
#--- LANGUAGE MODEL --------------------------------------------------------------------------------
# A language model determines the statistically most probable tag for an unknown word.
# A pattern.vector Classifier such as SLP can be used to produce a language model,
# by generalizing patterns from a treebank (i.e., a corpus of hand-tagged texts).
# For example:
# "generalizing/VBG from/IN patterns/NNS" and
# "dancing/VBG with/IN squirrels/NNS"
# both have a pattern -ing/VBG + [?] + NNS => IN.
# Unknown words preceded by -ing and followed by a plural noun will be tagged IN (preposition),
# unless (put simply) a majority of other patterns learned by the classifier disagrees.
class Model(object):
def __init__(self, path="", classifier=None, known=set(), unknown=set()):
""" A language model using a classifier (e.g., SLP, SVM) trained on morphology and context.
"""
try:
from pattern.vector import Classifier
from pattern.vector import Perceptron
except ImportError:
sys.path.insert(0, os.path.join(MODULE, ".."))
from vector import Classifier
from vector import Perceptron
self._path = path
# Use a property instead of a subclass, so users can choose their own classifier.
self._classifier = Classifier.load(path) if path else classifier or Perceptron()
# Parser.lexicon entries can be ambiguous (e.g., about/IN is RB 25% of the time).
# Parser.lexicon entries also in Model.unknown are overruled by the model.
# Parser.lexicon entries also in Model.known are not learned by the model
# (only their suffix and context is learned, see Model._v() below).
self.unknown = unknown | self._classifier._data.get("model_unknown", set())
self.known = known
@property
def path(self):
return self._path
    @classmethod
    def load(cls, lexicon={}, path=""):
        # Reads the model from the given path (the lexicon argument is unused here).
        return cls(path=path)
def save(self, path, final=True):
self._classifier._data["model_unknown"] = self.unknown
self._classifier.save(path, final) # final = unlink training data (smaller file).
def train(self, token, tag, previous=None, next=None):
""" Trains the model to predict the given tag for the given token,
in context of the given previous and next (token, tag)-tuples.
"""
self._classifier.train(self._v(token, previous, next), type=tag)
def classify(self, token, previous=None, next=None, **kwargs):
""" Returns the predicted tag for the given token,
in context of the given previous and next (token, tag)-tuples.
"""
return self._classifier.classify(self._v(token, previous, next), **kwargs)
def apply(self, token, previous=(None, None), next=(None, None)):
""" Returns a (token, tag)-tuple for the given token,
in context of the given previous and next (token, tag)-tuples.
"""
return [token[0], self._classifier.classify(self._v(token[0], previous, next))]
def _v(self, token, previous=None, next=None):
""" Returns a training vector for the given (word, tag)-tuple and its context.
"""
def f(v, s1, s2):
if s2:
v[s1 + " " + s2] = 1
p, n = previous, next
p = ("", "") if not p else (p[0] or "", p[1] or "")
n = ("", "") if not n else (n[0] or "", n[1] or "")
v = {}
f(v, "b", "b") # Bias.
f(v, "h", token[0]) # Capitalization.
f(v, "w", token[-6:] if token not in self.known or token in self.unknown else "")
f(v, "x", token[-3:]) # Word suffix.
f(v, "-x", p[0][-3:]) # Word suffix left.
f(v, "+x", n[0][-3:]) # Word suffix right.
f(v, "-t", p[1]) # Tag left.
f(v, "-+", p[1] + n[1]) # Tag left + right.
f(v, "+t", n[1]) # Tag right.
return v
def _get_description(self):
return self._classifier.description
def _set_description(self, s):
self._classifier.description = s
description = property(_get_description, _set_description)
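# Example (a minimal sketch; assumes pattern.vector is installed, data is hypothetical):
#   m = Model()
#   m.train("sat", tag="VBD", previous=("cat", "NN"), next=("on", "IN"))
#   m.classify("sat") # => "VBD", given enough training examples.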
#--- MORPHOLOGICAL RULES ---------------------------------------------------------------------------
# Brill's algorithm generates lexical (i.e., morphological) rules in the following format:
# NN s fhassuf 1 NNS x => unknown words ending in -s and tagged NN change to NNS.
# ly hassuf 2 RB x => unknown words ending in -ly change to RB.
class Morphology(lazylist):
def __init__(self, path="", known={}):
""" A list of rules based on word morphology (prefix, suffix).
"""
self.known = known
self._path = path
self._cmd = set((
"word", # Word is x.
"char", # Word contains x.
"haspref", # Word starts with x.
"hassuf", # Word end with x.
"addpref", # x + word is in lexicon.
"addsuf", # Word + x is in lexicon.
"deletepref", # Word without x at the start is in lexicon.
"deletesuf", # Word without x at the end is in lexicon.
"goodleft", # Word preceded by word x.
"goodright", # Word followed by word x.
))
self._cmd.update([("f" + x) for x in self._cmd])
@property
def path(self):
return self._path
def load(self):
# ["NN", "s", "fhassuf", "1", "NNS", "x"]
list.extend(self, (x.split() for x in _read(self._path)))
def apply(self, token, previous=(None, None), next=(None, None)):
""" Applies lexical rules to the given token, which is a [word, tag] list.
"""
w = token[0]
for r in self:
if r[1] in self._cmd: # Rule = ly hassuf 2 RB x
                f, x, pos, cmd = False, r[0], r[-2], r[1].lower()
            if r[2] in self._cmd: # Rule = NN s fhassuf 1 NNS x
                f, x, pos, cmd = True, r[1], r[-2], r[2].lower().lstrip("f")
if f and token[1] != r[0]:
continue
if (cmd == "word" and x == w) \
or (cmd == "char" and x in w) \
or (cmd == "haspref" and w.startswith(x)) \
or (cmd == "hassuf" and w.endswith(x)) \
or (cmd == "addpref" and x + w in self.known) \
or (cmd == "addsuf" and w + x in self.known) \
or (cmd == "deletepref" and w.startswith(x) and w[len(x):] in self.known) \
or (cmd == "deletesuf" and w.endswith(x) and w[:-len(x)] in self.known) \
or (cmd == "goodleft" and x == next[0]) \
or (cmd == "goodright" and x == previous[0]):
token[1] = pos
return token
def insert(self, i, tag, affix, cmd="hassuf", tagged=None):
""" Inserts a new rule that assigns the given tag to words with the given affix,
e.g., Morphology.append("RB", "-ly").
"""
if affix.startswith("-") and affix.endswith("-"):
affix, cmd = affix[+1:-1], "char"
if affix.startswith("-"):
affix, cmd = affix[+1:-0], "hassuf"
if affix.endswith("-"):
affix, cmd = affix[+0:-1], "haspref"
if tagged:
r = [tagged, affix, "f"+cmd.lstrip("f"), tag, "x"]
else:
r = [affix, cmd.lstrip("f"), tag, "x"]
lazylist.insert(self, i, r)
def append(self, *args, **kwargs):
self.insert(len(self)-1, *args, **kwargs)
def extend(self, rules=[]):
for r in rules:
self.append(*r)
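# Example: tag unknown words ending in -ly as adverbs (illustrative):
#   m = Morphology()
#   m.append("RB", "-ly")
#   m.apply(["happily", "NN"]) # => ["happily", "RB"]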
#--- CONTEXT RULES ---------------------------------------------------------------------------------
# Brill's algorithm generates contextual rules in the following format:
# VBD VB PREVTAG TO => unknown word tagged VBD changes to VB if preceded by a word tagged TO.
class Context(lazylist):
def __init__(self, path=""):
""" A list of rules based on context (preceding and following words).
"""
self._path = path
self._cmd = set((
"prevtag", # Preceding word is tagged x.
"nexttag", # Following word is tagged x.
"prev2tag", # Word 2 before is tagged x.
"next2tag", # Word 2 after is tagged x.
"prev1or2tag", # One of 2 preceding words is tagged x.
"next1or2tag", # One of 2 following words is tagged x.
"prev1or2or3tag", # One of 3 preceding words is tagged x.
"next1or2or3tag", # One of 3 following words is tagged x.
"surroundtag", # Preceding word is tagged x and following word is tagged y.
"curwd", # Current word is x.
"prevwd", # Preceding word is x.
"nextwd", # Following word is x.
"prev1or2wd", # One of 2 preceding words is x.
"next1or2wd", # One of 2 following words is x.
"next1or2or3wd", # One of 3 preceding words is x.
"prev1or2or3wd", # One of 3 following words is x.
"prevwdtag", # Preceding word is x and tagged y.
"nextwdtag", # Following word is x and tagged y.
"wdprevtag", # Current word is y and preceding word is tagged x.
"wdnexttag", # Current word is x and following word is tagged y.
"wdand2aft", # Current word is x and word 2 after is y.
"wdand2tagbfr", # Current word is y and word 2 before is tagged x.
"wdand2tagaft", # Current word is x and word 2 after is tagged y.
"lbigram", # Current word is y and word before is x.
"rbigram", # Current word is x and word after is y.
"prevbigram", # Preceding word is tagged x and word before is tagged y.
"nextbigram", # Following word is tagged x and word after is tagged y.
))
@property
def path(self):
return self._path
def load(self):
# ["VBD", "VB", "PREVTAG", "TO"]
list.extend(self, (x.split() for x in _read(self._path)))
def apply(self, tokens):
""" Applies contextual rules to the given list of tokens,
where each token is a [word, tag] list.
"""
o = [("STAART", "STAART")] * 3 # Empty delimiters for look ahead/back.
t = o + tokens + o
for i, token in enumerate(t):
for r in self:
if token[1] == "STAART":
continue
if token[1] != r[0] and r[0] != "*":
continue
cmd, x, y = r[2], r[3], r[4] if len(r) > 4 else ""
cmd = cmd.lower()
if (cmd == "prevtag" and x == t[i-1][1]) \
or (cmd == "nexttag" and x == t[i+1][1]) \
or (cmd == "prev2tag" and x == t[i-2][1]) \
or (cmd == "next2tag" and x == t[i+2][1]) \
or (cmd == "prev1or2tag" and x in (t[i-1][1], t[i-2][1])) \
or (cmd == "next1or2tag" and x in (t[i+1][1], t[i+2][1])) \
or (cmd == "prev1or2or3tag" and x in (t[i-1][1], t[i-2][1], t[i-3][1])) \
or (cmd == "next1or2or3tag" and x in (t[i+1][1], t[i+2][1], t[i+3][1])) \
or (cmd == "surroundtag" and x == t[i-1][1] and y == t[i+1][1]) \
or (cmd == "curwd" and x == t[i+0][0]) \
or (cmd == "prevwd" and x == t[i-1][0]) \
or (cmd == "nextwd" and x == t[i+1][0]) \
or (cmd == "prev1or2wd" and x in (t[i-1][0], t[i-2][0])) \
or (cmd == "next1or2wd" and x in (t[i+1][0], t[i+2][0])) \
or (cmd == "prevwdtag" and x == t[i-1][0] and y == t[i-1][1]) \
or (cmd == "nextwdtag" and x == t[i+1][0] and y == t[i+1][1]) \
or (cmd == "wdprevtag" and x == t[i-1][1] and y == t[i+0][0]) \
or (cmd == "wdnexttag" and x == t[i+0][0] and y == t[i+1][1]) \
or (cmd == "wdand2aft" and x == t[i+0][0] and y == t[i+2][0]) \
or (cmd == "wdand2tagbfr" and x == t[i-2][1] and y == t[i+0][0]) \
or (cmd == "wdand2tagaft" and x == t[i+0][0] and y == t[i+2][1]) \
or (cmd == "lbigram" and x == t[i-1][0] and y == t[i+0][0]) \
or (cmd == "rbigram" and x == t[i+0][0] and y == t[i+1][0]) \
or (cmd == "prevbigram" and x == t[i-2][1] and y == t[i-1][1]) \
or (cmd == "nextbigram" and x == t[i+1][1] and y == t[i+2][1]):
t[i] = [t[i][0], r[1]]
return t[len(o):-len(o)]
def insert(self, i, tag1, tag2, cmd="prevtag", x=None, y=None):
""" Inserts a new rule that updates words with tag1 to tag2,
given constraints x and y, e.g., Context.append("TO < NN", "VB")
"""
if " < " in tag1 and not x and not y:
tag1, x = tag1.split(" < "); cmd="prevtag"
if " > " in tag1 and not x and not y:
x, tag1 = tag1.split(" > "); cmd="nexttag"
lazylist.insert(self, i, [tag1, tag2, cmd, x or "", y or ""])
def append(self, *args, **kwargs):
self.insert(len(self)-1, *args, **kwargs)
def extend(self, rules=[]):
for r in rules:
self.append(*r)
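# Example: a word tagged NN that follows a word tagged TO becomes VB (illustrative):
#   c = Context()
#   c.append("NN", "VB", cmd="prevtag", x="TO")
#   c.apply([["to", "TO"], ["fly", "NN"]]) # => [["to", "TO"], ["fly", "VB"]]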
#--- NAMED ENTITY RECOGNIZER -----------------------------------------------------------------------
RE_ENTITY1 = re.compile(r"^http://") # http://www.domain.com/path
RE_ENTITY2 = re.compile(r"^www\..*?\.(com|org|net|edu|de|uk)$")  # www.domain.com
RE_ENTITY3 = re.compile(r"^[\w\-\.\+]+@(\w[\w\-]+\.)+[\w\-]+$") # name@domain.com
class Entities(lazydict):
def __init__(self, path="", tag="NNP"):
""" A dictionary of named entities and their labels.
            For domain names and e-mail addresses, regular expressions are used.
"""
self.tag = tag
self._path = path
self._cmd = ((
"pers", # Persons: George/NNP-PERS
"loc", # Locations: Washington/NNP-LOC
"org", # Organizations: Google/NNP-ORG
))
@property
def path(self):
return self._path
def load(self):
# ["Alexander", "the", "Great", "PERS"]
# {"alexander": [["alexander", "the", "great", "pers"], ...]}
for x in _read(self.path):
x = [x.lower() for x in x.split()]
dict.setdefault(self, x[0], []).append(x)
def apply(self, tokens):
""" Applies the named entity recognizer to the given list of tokens,
where each token is a [word, tag] list.
"""
# Note: we could also scan for patterns, e.g.,
# "my|his|her name is|was *" => NNP-PERS.
i = 0
while i < len(tokens):
w = tokens[i][0].lower()
if RE_ENTITY1.match(w) \
or RE_ENTITY2.match(w) \
or RE_ENTITY3.match(w):
tokens[i][1] = self.tag
if w in self:
for e in self[w]:
# Look ahead to see if successive words match the named entity.
e, tag = (e[:-1], "-"+e[-1].upper()) if e[-1] in self._cmd else (e, "")
b = True
for j, e in enumerate(e):
if i + j >= len(tokens) or tokens[i+j][0].lower() != e:
b = False; break
if b:
for token in tokens[i:i+j+1]:
token[1] = token[1] if token[1].startswith(self.tag) else self.tag
token[1] += tag
i += j
break
i += 1
return tokens
def append(self, entity, name="pers"):
""" Appends a named entity to the lexicon,
e.g., Entities.append("Hooloovoo", "PERS")
"""
        e = [s.lower() for s in entity.split(" ") + [name]]
self.setdefault(e[0], []).append(e)
def extend(self, entities):
for entity, name in entities:
self.append(entity, name)
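# Example (illustrative):
#   e = Entities()
#   e.append("Hooloovoo", "PERS")
#   e.apply([["Hooloovoo", "NN"]]) # => [["Hooloovoo", "NNP-PERS"]]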
#### PARSER ########################################################################################
#--- PARSER ----------------------------------------------------------------------------------------
# A shallow parser can be used to retrieve syntactic-semantic information from text
# in an efficient way (usually at the expense of deeper configurational syntactic information).
# The shallow parser in Pattern is meant to handle the following tasks:
# 1) Tokenization: split punctuation marks from words and find sentence periods.
# 2) Tagging: find the part-of-speech tag of each word (noun, verb, ...) in a sentence.
# 3) Chunking: find words that belong together in a phrase.
# 4) Role labeling: find the subject and object of the sentence.
# 5) Lemmatization: find the base form of each word ("was" => "is").
# WORD TAG CHUNK PNP ROLE LEMMA
#------------------------------------------------------------------
# The DT B-NP O NP-SBJ-1 the
# black JJ I-NP O NP-SBJ-1 black
# cat NN I-NP O NP-SBJ-1 cat
# sat VB B-VP O VP-1 sit
# on IN B-PP B-PNP PP-LOC on
# the DT B-NP I-PNP NP-OBJ-1 the
# mat NN I-NP I-PNP NP-OBJ-1 mat
# . . O O O .
# The example demonstrates what information can be retrieved:
#
# - the period is split from "mat." = the end of the sentence,
# - the words are annotated: NN (noun), VB (verb), JJ (adjective), DT (determiner), ...
# - the phrases are annotated: NP (noun phrase), VP (verb phrase), PNP (preposition), ...
# - the phrases are labeled: SBJ (subject), OBJ (object), LOC (location), ...
# - the phrase start is marked: B (begin), I (inside), O (outside),
# - the past tense "sat" is lemmatized => "sit".
# By default, the English parser uses the Penn Treebank II tagset:
# http://www.clips.ua.ac.be/pages/penn-treebank-tagset
PTB = PENN = "penn"
class Parser(object):
def __init__(self, lexicon={}, frequency={}, model=None, morphology=None, context=None, entities=None, default=("NN", "NNP", "CD"), language=None):
""" A simple shallow parser using a Brill-based part-of-speech tagger.
The given lexicon is a dictionary of known words and their part-of-speech tag.
The given default tags are used for unknown words.
Unknown words that start with a capital letter are tagged NNP (except for German).
Unknown words that contain only digits and punctuation are tagged CD.
Optionally, morphological and contextual rules (or a language model) can be used
to improve the tags of unknown words.
The given language can be used to discern between
Germanic and Romance languages for phrase chunking.
"""
self.lexicon = lexicon or {}
self.frequency = frequency or {}
self.model = model
self.morphology = morphology
self.context = context
self.entities = entities
self.default = default
self.language = language
# Load data.
f = lambda s: isinstance(s, basestring) or hasattr(s, "read")
if f(lexicon):
# Known words.
self.lexicon = Lexicon(path=lexicon)
if f(frequency):
# Word frequency.
            self.frequency = Frequency(path=frequency)
if f(morphology):
# Unknown word rules based on word suffix.
self.morphology = Morphology(path=morphology, known=self.lexicon)
if f(context):
# Unknown word rules based on word context.
self.context = Context(path=context)
if f(entities):
# Named entities.
self.entities = Entities(path=entities, tag=default[1])
if f(model):
# Word part-of-speech classifier.
try:
self.model = Model(path=model)
except ImportError: # pattern.vector
pass
def find_keywords(self, string, **kwargs):
""" Returns a sorted list of keywords in the given string.
"""
return find_keywords(string,
parser = self,
top = kwargs.pop("top", 10),
frequency = kwargs.pop("frequency", {}), **kwargs
)
def find_tokens(self, string, **kwargs):
""" Returns a list of sentences from the given string.
Punctuation marks are separated from each word by a space.
"""
# "The cat purs." => ["The cat purs ."]
return find_tokens(string,
punctuation = kwargs.get( "punctuation", PUNCTUATION),
abbreviations = kwargs.get("abbreviations", ABBREVIATIONS),
replace = kwargs.get( "replace", replacements),
linebreak = r"\n{2,}")
def find_tags(self, tokens, **kwargs):
""" Annotates the given list of tokens with part-of-speech tags.
Returns a list of tokens, where each token is now a [word, tag]-list.
"""
# ["The", "cat", "purs"] => [["The", "DT"], ["cat", "NN"], ["purs", "VB"]]
return find_tags(tokens,
lexicon = kwargs.get( "lexicon", self.lexicon or {}),
model = kwargs.get( "model", self.model),
morphology = kwargs.get("morphology", self.morphology),
context = kwargs.get( "context", self.context),
entities = kwargs.get( "entities", self.entities),
language = kwargs.get( "language", self.language),
default = kwargs.get( "default", self.default),
map = kwargs.get( "map", None))
def find_chunks(self, tokens, **kwargs):
""" Annotates the given list of tokens with chunk tags.
Several tags can be added, for example chunk + preposition tags.
"""
# [["The", "DT"], ["cat", "NN"], ["purs", "VB"]] =>
# [["The", "DT", "B-NP"], ["cat", "NN", "I-NP"], ["purs", "VB", "B-VP"]]
return find_prepositions(
find_chunks(tokens,
language = kwargs.get("language", self.language)))
def find_prepositions(self, tokens, **kwargs):
""" Annotates the given list of tokens with prepositional noun phrase tags.
"""
return find_prepositions(tokens) # See also Parser.find_chunks().
def find_labels(self, tokens, **kwargs):
""" Annotates the given list of tokens with verb/predicate tags.
"""
return find_relations(tokens)
def find_lemmata(self, tokens, **kwargs):
""" Annotates the given list of tokens with word lemmata.
"""
return [token + [token[0].lower()] for token in tokens]
def parse(self, s, tokenize=True, tags=True, chunks=True, relations=False, lemmata=False, encoding="utf-8", **kwargs):
""" Takes a string (sentences) and returns a tagged Unicode string (TaggedString).
Sentences in the output are separated by newlines.
        With tokenize=True, punctuation is split from words and sentences are separated by newlines.
With tags=True, part-of-speech tags are parsed (NN, VB, IN, ...).
With chunks=True, phrase chunk tags are parsed (NP, VP, PP, PNP, ...).
With relations=True, semantic role labels are parsed (SBJ, OBJ).
With lemmata=True, word lemmata are parsed.
Optional parameters are passed to
the tokenizer, tagger, chunker, labeler and lemmatizer.
"""
# Tokenizer.
if tokenize is True:
s = self.find_tokens(s, **kwargs)
if isinstance(s, (list, tuple)):
s = [isinstance(s, basestring) and s.split(" ") or s for s in s]
if isinstance(s, basestring):
s = [s.split(" ") for s in s.split("\n")]
# Unicode.
for i in range(len(s)):
for j in range(len(s[i])):
if isinstance(s[i][j], str):
s[i][j] = decode_string(s[i][j], encoding)
# Tagger (required by chunker, labeler & lemmatizer).
if tags or chunks or relations or lemmata:
s[i] = self.find_tags(s[i], **kwargs)
else:
s[i] = [[w] for w in s[i]]
# Chunker.
if chunks or relations:
s[i] = self.find_chunks(s[i], **kwargs)
# Labeler.
if relations:
s[i] = self.find_labels(s[i], **kwargs)
# Lemmatizer.
if lemmata:
s[i] = self.find_lemmata(s[i], **kwargs)
# Slash-formatted tagged string.
# With collapse=False (or split=True), returns raw list
# (this output is not usable by tree.Text).
if not kwargs.get("collapse", True) \
or kwargs.get("split", False):
return s
# Construct TaggedString.format.
# (this output is usable by tree.Text).
format = ["word"]
if tags:
format.append("part-of-speech")
if chunks:
format.extend(("chunk", "preposition"))
if relations:
format.append("relation")
if lemmata:
format.append("lemma")
# Collapse raw list.
# Sentences are separated by newlines, tokens by spaces, tags by slashes.
# Slashes in words are encoded with &slash;
for i in range(len(s)):
for j in range(len(s[i])):
s[i][j][0] = s[i][j][0].replace("/", "&slash;")
s[i][j] = "/".join(s[i][j])
s[i] = " ".join(s[i])
s = "\n".join(s)
s = TaggedString(s, format, language=kwargs.get("language", self.language))
return s
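# Example (a minimal sketch; with no lexicon, tags fall back to the defaults):
#   p = Parser(language="en")
#   print(p.parse("The cat sat on the mat."))
#   # One sentence per line; tokens formatted as word/part-of-speech/chunk/preposition.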
#--- TAGGED STRING ---------------------------------------------------------------------------------
# Pattern.parse() returns a TaggedString: a Unicode string with "tags" and "language" attributes.
# The pattern.text.tree.Text class uses this attribute to determine the token format and
# transform the tagged string to a parse tree of nested Sentence, Chunk and Word objects.
TOKENS = "tokens"
class TaggedString(unicode):
    def __new__(cls, string, tags=["word"], language=None):
""" Unicode string with tags and language attributes.
For example: TaggedString("cat/NN/NP", tags=["word", "pos", "chunk"]).
"""
# From a TaggedString:
if isinstance(string, unicode) and hasattr(string, "tags"):
tags, language = string.tags, string.language
# From a TaggedString.split(TOKENS) list:
if isinstance(string, list):
string = [[[x.replace("/", "&slash;") for x in token] for token in s] for s in string]
string = "\n".join(" ".join("/".join(token) for token in s) for s in string)
        s = unicode.__new__(cls, string)
s.tags = list(tags)
s.language = language
return s
def split(self, sep=TOKENS):
""" Returns a list of sentences, where each sentence is a list of tokens,
where each token is a list of word + tags.
"""
if sep != TOKENS:
return unicode.split(self, sep)
if len(self) == 0:
return []
return [[[x.replace("&slash;", "/") for x in token.split("/")]
for token in sentence.split(" ")]
for sentence in unicode.split(self, "\n")]
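# Example (illustrative):
#   s = TaggedString("The/DT cat/NN", tags=["word", "part-of-speech"])
#   s.split() # => [[["The", "DT"], ["cat", "NN"]]]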
#--- UNIVERSAL TAGSET ------------------------------------------------------------------------------
# The default part-of-speech tagset used in Pattern is Penn Treebank II.
# However, not all languages are well-suited to Penn Treebank (which was developed for English).
# As more languages are implemented, this is becoming more problematic.
#
# A universal tagset is proposed by Slav Petrov (2012):
# http://www.petrovi.de/data/lrec.pdf
#
# Subclasses of Parser should start implementing
# Parser.parse(tagset=UNIVERSAL) with a simplified tagset.
# The names of the constants correspond to Petrov's naming scheme, while
# the value of the constants correspond to Penn Treebank.
UNIVERSAL = "universal"
NOUN, VERB, ADJ, ADV, PRON, DET, PREP, ADP, NUM, CONJ, INTJ, PRT, PUNC, X = \
"NN", "VB", "JJ", "RB", "PR", "DT", "PP", "PP", "NO", "CJ", "UH", "PT", ".", "X"
def penntreebank2universal(token, tag):
""" Returns a (token, tag)-tuple with a simplified universal part-of-speech tag.
"""
if tag.startswith(("NNP-", "NNPS-")):
return (token, "%s-%s" % (NOUN, tag.split("-")[-1]))
if tag in ("NN", "NNS", "NNP", "NNPS", "NP"):
return (token, NOUN)
if tag in ("MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"):
return (token, VERB)
if tag in ("JJ", "JJR", "JJS"):
return (token, ADJ)
if tag in ("RB", "RBR", "RBS", "WRB"):
return (token, ADV)
if tag in ("PRP", "PRP$", "WP", "WP$"):
return (token, PRON)
if tag in ("DT", "PDT", "WDT", "EX"):
return (token, DET)
if tag in ("IN",):
return (token, PREP)
if tag in ("CD",):
return (token, NUM)
if tag in ("CC",):
return (token, CONJ)
if tag in ("UH",):
return (token, INTJ)
if tag in ("POS", "RP", "TO"):
return (token, PRT)
if tag in ("SYM", "LS", ".", "!", "?", ",", ":", "(", ")", "\"", "#", "$"):
return (token, PUNC)
return (token, X)
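# Example (illustrative):
#   penntreebank2universal("cats", "NNS") # => ("cats", "NN")
#   penntreebank2universal("ran", "VBD")  # => ("ran", "VB")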
#--- TOKENIZER -------------------------------------------------------------------------------------
TOKEN = re.compile(r"(\S+)\s")
# Handle common punctuation marks.
PUNCTUATION = \
punctuation = ".,;:!?()[]{}`''\"@#$^&*+-|=~_"
# Handle common abbreviations.
ABBREVIATIONS = abbreviations = set((
"a.", "adj.", "adv.", "al.", "a.m.", "art.", "c.", "capt.", "cert.", "cf.", "col.", "Col.",
"comp.", "conf.", "def.", "Dep.", "Dept.", "Dr.", "dr.", "ed.", "e.g.", "esp.", "etc.", "ex.",
"f.", "fig.", "gen.", "id.", "i.e.", "int.", "l.", "m.", "Med.", "Mil.", "Mr.", "n.", "n.q.",
"orig.", "pl.", "pred.", "pres.", "p.m.", "ref.", "v.", "vs.", "w/"
))
RE_ABBR1 = re.compile(r"^[A-Za-z]\.$")                      # single letter, "T. De Smedt"
RE_ABBR2 = re.compile(r"^([A-Za-z]\.)+$")                   # alternating letters, "U.S."
RE_ABBR3 = re.compile(r"^[A-Z][bcdfghjklmnpqrstvwxz]+\.$")  # capital followed by consonants, "Mr."
# Handle emoticons.
EMOTICONS = { # (facial expression, sentiment)-keys
("love" , +1.00): set(("<3", u"♥", u"❤")),
("grin" , +1.00): set((">:D", ":-D", ":D", "=-D", "=D", "X-D", "x-D", "XD", "xD", "8-D")),
("taunt", +0.75): set((">:P", ":-P", ":P", ":-p", ":p", ":-b", ":b", ":c)", ":o)", ":^)")),
("smile", +0.50): set((">:)", ":-)", ":)", "=)", "=]", ":]", ":}", ":>", ":3", "8)", "8-)")),
("wink" , +0.25): set((">;]", ";-)", ";)", ";-]", ";]", ";D", ";^)", "*-)", "*)")),
("gasp" , +0.05): set((">:o", ":-O", ":O", ":o", ":-o", "o_O", "o.O", u"°O°", u"°o°")),
("worry", -0.25): set((">:/", ":-/", ":/", ":\\", ">:\\", ":-.", ":-s", ":s", ":S", ":-S", ">.>")),
("frown", -0.75): set((">:[", ":-(", ":(", "=(", ":-[", ":[", ":{", ":-<", ":c", ":-c", "=/")),
("cry" , -1.00): set((":'(", ":'''(", ";'("))
}
RE_EMOTICONS = [r" ?".join(map(re.escape, e)) for v in EMOTICONS.values() for e in v]
RE_EMOTICONS = re.compile(r"(%s)($|\s)" % "|".join(RE_EMOTICONS))
# Handle sarcasm punctuation (!).
RE_SARCASM = re.compile(r"\( ?\! ?\)")
# Handle common contractions.
replacements = {
"'d": " 'd",
"'m": " 'm",
"'s": " 's",
"'ll": " 'll",
"'re": " 're",
"'ve": " 've",
"n't": " n't"
}
# Handle paragraph line breaks (\n\n marks end of sentence).
EOS = "END-OF-SENTENCE"
def find_tokens(string, punctuation=PUNCTUATION, abbreviations=ABBREVIATIONS, replace=replacements, linebreak=r"\n{2,}"):
""" Returns a list of sentences. Each sentence is a space-separated string of tokens (words).
Handles common cases of abbreviations (e.g., etc., ...).
Punctuation marks are split from other words. Periods (or ?!) mark the end of a sentence.
        Sentence breaks for headings without an ending period are inferred from line breaks.
"""
# Handle periods separately.
punctuation = tuple(punctuation.replace(".", ""))
# Handle replacements (contractions).
for a, b in replace.items():
string = re.sub(a, b, string)
# Handle Unicode quotes.
if isinstance(string, unicode):
string = string.replace(u"“", u" “ ")
string = string.replace(u"”", u" ” ")
string = string.replace(u"‘", u" ‘ ")
string = string.replace(u"’", u" ’ ")
# Collapse whitespace.
string = re.sub("\r\n", "\n", string)
string = re.sub(linebreak, " %s " % EOS, string)
string = re.sub(r"\s+", " ", string)
tokens = []
# Handle punctuation marks.
for t in TOKEN.findall(string+" "):
if len(t) > 0:
tail = []
            while t.startswith(punctuation) and \
                  t not in replace:
# Split leading punctuation.
if t.startswith(punctuation):
tokens.append(t[0]); t=t[1:]
            while t.endswith(punctuation+(".",)) and \
                  t not in replace:
# Split trailing punctuation.
if t.endswith(punctuation):
tail.append(t[-1]); t=t[:-1]
# Split ellipsis (...) before splitting period.
if t.endswith("..."):
tail.append("..."); t=t[:-3].rstrip(".")
# Split period (if not an abbreviation).
if t.endswith("."):
if t in abbreviations or \
RE_ABBR1.match(t) is not None or \
RE_ABBR2.match(t) is not None or \
RE_ABBR3.match(t) is not None:
break
else:
tail.append(t[-1]); t=t[:-1]
if t != "":
tokens.append(t)
tokens.extend(reversed(tail))
# Handle sentence breaks (periods, quotes, parenthesis).
sentences, i, j = [[]], 0, 0
while j < len(tokens):
if tokens[j] in ("...", ".", "!", "?", EOS):
while j < len(tokens) \
and tokens[j] in ("'", "\"", u"”", u"’", "...", ".", "!", "?", ")", EOS):
if tokens[j] in ("'", "\"") and sentences[-1].count(tokens[j]) % 2 == 0:
break # Balanced quotes.
j += 1
sentences[-1].extend(t for t in tokens[i:j] if t != EOS)
sentences.append([])
i = j
j += 1
# Handle emoticons.
sentences[-1].extend(tokens[i:j])
sentences = (" ".join(s) for s in sentences if len(s) > 0)
sentences = (RE_SARCASM.sub("(!)", s) for s in sentences)
sentences = [RE_EMOTICONS.sub(
lambda m: m.group(1).replace(" ", "") + m.group(2), s) for s in sentences]
return sentences
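# Example (illustrative):
#   find_tokens("I'm here (for real)!")
#   # => ["I 'm here ( for real ) !"]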
#--- PART-OF-SPEECH TAGGER -------------------------------------------------------------------------
# Unknown words are recognized as numbers if they contain only digits and -,.:/%$
CD = re.compile(r"^[0-9\-\,\.\:\/\%\$]+$")
def _suffix_rules(token, tag="NN"):
""" Default morphological tagging rules for English, based on word suffixes.
"""
if isinstance(token, (list, tuple)):
token, tag = token
if token.endswith("ing"):
tag = "VBG"
if token.endswith("ly"):
tag = "RB"
if token.endswith("s") and not token.endswith(("is", "ous", "ss")):
tag = "NNS"
if token.endswith(("able", "al", "ful", "ible", "ient", "ish", "ive", "less", "tic", "ous")) or "-" in token:
tag = "JJ"
if token.endswith("ed"):
tag = "VBN"
if token.endswith(("ate", "ify", "ise", "ize")):
tag = "VBP"
return [token, tag]
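# Example (illustrative):
#   _suffix_rules("quickly") # => ["quickly", "RB"]
#   _suffix_rules("tables")  # => ["tables", "NNS"]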
def find_tags(tokens, lexicon={}, model=None, morphology=None, context=None, entities=None, default=("NN", "NNP", "CD"), language="en", map=None, **kwargs):
""" Returns a list of [token, tag]-items for the given list of tokens:
["The", "cat", "purs"] => [["The", "DT"], ["cat", "NN"], ["purs", "VB"]]
Words are tagged using the given lexicon of (word, tag)-items.
Unknown words are tagged NN by default.
Unknown words that start with a capital letter are tagged NNP (unless language="de").
Unknown words that consist only of digits and punctuation marks are tagged CD.
Unknown words are then improved with morphological rules.
All words are improved with contextual rules.
If a model is given, uses model for unknown words instead of morphology and context.
If map is a function, it is applied to each (token, tag) after applying all rules.
"""
tagged = []
# Tag known words.
for i, token in enumerate(tokens):
tagged.append([token, lexicon.get(token, i == 0 and lexicon.get(token.lower()) or None)])
# Tag unknown words.
for i, (token, tag) in enumerate(tagged):
prev, next = (None, None), (None, None)
if i > 0:
prev = tagged[i-1]
if i < len(tagged) - 1:
next = tagged[i+1]
if tag is None or token in (model is not None and model.unknown or ()):
# Use language model (i.e., SLP).
if model is not None:
tagged[i] = model.apply([token, None], prev, next)
# Use NNP for capitalized words (except in German).
elif token.istitle() and language != "de":
tagged[i] = [token, default[1]]
# Use CD for digits and numbers.
elif CD.match(token) is not None:
tagged[i] = [token, default[2]]
# Use suffix rules (e.g., -ly = RB).
elif morphology is not None:
tagged[i] = morphology.apply([token, default[0]], prev, next)
# Use suffix rules (English default).
elif language == "en":
tagged[i] = _suffix_rules([token, default[0]])
# Use most frequent tag (NN).
else:
tagged[i] = [token, default[0]]
# Tag words by context.
if context is not None and model is None:
tagged = context.apply(tagged)
# Tag named entities.
if entities is not None:
tagged = entities.apply(tagged)
# Map tags with a custom function.
if map is not None:
tagged = [list(map(token, tag)) or [token, default[0]] for token, tag in tagged]
return tagged
#--- PHRASE CHUNKER --------------------------------------------------------------------------------
SEPARATOR = "/"
NN = r"NN|NNS|NNP|NNPS|NNPS?\-[A-Z]{3,4}|PR|PRP|PRP\$"
VB = r"VB|VBD|VBG|VBN|VBP|VBZ"
JJ = r"JJ|JJR|JJS"
RB = r"(?<!W)RB|RBR|RBS"
CC = r"CC|CJ"
# Chunking rules.
# CHUNKS[0] = Germanic: RB + JJ precedes NN ("the round table").
# CHUNKS[1] = Romance: RB + JJ precedes or follows NN ("la table ronde", "une jolie fille").
CHUNKS = [[
# Germanic languages: en, de, nl, ...
( "NP", r"((NN)/)* ((DT|CD|CC)/)* ((RB|JJ)/)* (((JJ)/(CC|,)/)*(JJ)/)* ((NN)/)+"),
( "VP", r"(((MD|TO|RB)/)* ((VB)/)+ ((RP)/)*)+"),
( "VP", r"((MD)/)"),
( "PP", r"((IN|PP)/)+"),
("ADJP", r"((RB|JJ)/)* ((JJ)/,/)* ((JJ)/(CC)/)* ((JJ)/)+"),
("ADVP", r"((RB)/)+"),
], [
# Romance languages: es, fr, it, ...
( "NP", r"((NN)/)* ((DT|CD|CC)/)* ((RB|JJ|,)/)* (((JJ)/(CC|,)/)*(JJ)/)* ((NN)/)+ ((RB|JJ)/)*"),
( "VP", r"(((MD|TO|RB)/)* ((VB)/)+ ((RP)/)* ((RB)/)*)+"),
( "VP", r"((MD)/)"),
( "PP", r"((IN|PP)/)+"),
("ADJP", r"((RB|JJ)/)* ((JJ)/,/)* ((JJ)/(CC)/)* ((JJ)/)+"),
("ADVP", r"((RB)/)+"),
]]
for i in (0, 1):
for j, (tag, s) in enumerate(CHUNKS[i]):
s = s.replace("NN", NN)
s = s.replace("VB", VB)
s = s.replace("JJ", JJ)
s = s.replace("RB", RB)
s = s.replace(" ", "")
s = re.compile(s)
CHUNKS[i][j] = (tag, s)
# Handle ADJP before VP,
# so that RB prefers next ADJP over previous VP.
CHUNKS[0].insert(1, CHUNKS[0].pop(3))
CHUNKS[1].insert(1, CHUNKS[1].pop(3))
def find_chunks(tagged, language="en"):
""" The input is a list of [token, tag]-items.
The output is a list of [token, tag, chunk]-items:
The/DT nice/JJ fish/NN is/VBZ dead/JJ ./. =>
The/DT/B-NP nice/JJ/I-NP fish/NN/I-NP is/VBZ/B-VP dead/JJ/B-ADJP ././O
"""
chunked = [x for x in tagged]
tags = "".join("%s%s" % (tag, SEPARATOR) for token, tag in tagged)
# Use Germanic or Romance chunking rules according to given language.
    for tag, rule in CHUNKS[int(language in ("ca", "es", "pt", "fr", "it", "ro"))]:
for m in rule.finditer(tags):
# Find the start of chunks inside the tags-string.
# Number of preceding separators = number of preceding tokens.
i = m.start()
j = tags[:i].count(SEPARATOR)
n = m.group(0).count(SEPARATOR)
for k in range(j, j+n):
if len(chunked[k]) == 3:
continue
if len(chunked[k]) < 3:
# A conjunction or comma cannot be start of a chunk.
if k == j and chunked[k][1] in ("CC", "CJ", ","):
j += 1
# Mark first token in chunk with B-.
elif k == j:
chunked[k].append("B-" + tag)
# Mark other tokens in chunk with I-.
else:
chunked[k].append("I-" + tag)
# Mark chinks (tokens outside of a chunk) with O-.
for chink in filter(lambda x: len(x) < 3, chunked):
chink.append("O")
# Post-processing corrections.
for i, (word, tag, chunk) in enumerate(chunked):
if tag.startswith("RB") and chunk == "B-NP":
# "Perhaps you" => ADVP + NP
# "Really nice work" => NP
# "Really, nice work" => ADVP + O + NP
if i < len(chunked)-1 and not chunked[i+1][1].startswith("JJ"):
chunked[i+0][2] = "B-ADVP"
chunked[i+1][2] = "B-NP"
if i < len(chunked)-1 and chunked[i+1][1] in ("CC", "CJ", ","):
chunked[i+1][2] = "O"
if i < len(chunked)-2 and chunked[i+1][2] == "O":
chunked[i+2][2] = "B-NP"
return chunked
def find_prepositions(chunked):
""" The input is a list of [token, tag, chunk]-items.
The output is a list of [token, tag, chunk, preposition]-items.
PP-chunks followed by NP-chunks make up a PNP-chunk.
"""
# Tokens that are not part of a preposition just get the O-tag.
for ch in chunked:
ch.append("O")
for i, chunk in enumerate(chunked):
if chunk[2].endswith("PP") and chunk[-1] == "O":
# Find PP followed by other PP, NP with nouns and pronouns, VP with a gerund.
if i < len(chunked)-1 and \
(chunked[i+1][2].endswith(("NP", "PP")) or \
chunked[i+1][1] in ("VBG", "VBN")):
chunk[-1] = "B-PNP"
pp = True
for ch in chunked[i+1:]:
if not (ch[2].endswith(("NP", "PP")) or ch[1] in ("VBG", "VBN")):
break
if ch[2].endswith("PP") and pp:
ch[-1] = "I-PNP"
if not ch[2].endswith("PP"):
ch[-1] = "I-PNP"
pp = False
return chunked
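# Example (illustrative): "on the mat" becomes a PNP-chunk.
#   find_prepositions([["on", "IN", "B-PP"], ["the", "DT", "B-NP"], ["mat", "NN", "I-NP"]])
#   # => [["on", "IN", "B-PP", "B-PNP"], ["the", "DT", "B-NP", "I-PNP"], ["mat", "NN", "I-NP", "I-PNP"]]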
#--- SEMANTIC ROLE LABELER -------------------------------------------------------------------------
# Naive approach.
BE = dict.fromkeys(("be", "am", "are", "is", "being", "was", "were", "been"), True)
GO = dict.fromkeys(("go", "goes", "going", "went"), True)
def find_relations(chunked):
""" The input is a list of [token, tag, chunk]-items.
The output is a list of [token, tag, chunk, relation]-items.
A noun phrase preceding a verb phrase is perceived as sentence subject.
A noun phrase following a verb phrase is perceived as sentence object.
"""
tag = lambda token: token[2].split("-")[-1] # B-NP => NP
# Group successive tokens with the same chunk-tag.
chunks = []
for token in chunked:
if len(chunks) == 0 \
or token[2].startswith("B-") \
or tag(token) != tag(chunks[-1][-1]):
chunks.append([])
chunks[-1].append(token+["O"])
# If a VP is preceded by a NP, the NP is tagged as NP-SBJ-(id).
# If a VP is followed by a NP, the NP is tagged as NP-OBJ-(id).
# Chunks that are not part of a relation get an O-tag.
id = 0
for i, chunk in enumerate(chunks):
if tag(chunk[-1]) == "VP" and i > 0 and tag(chunks[i-1][-1]) == "NP":
if chunk[-1][-1] == "O":
id += 1
for token in chunk:
token[-1] = "VP-" + str(id)
for token in chunks[i-1]:
token[-1] += "*NP-SBJ-" + str(id)
token[-1] = token[-1].lstrip("O-*")
if tag(chunk[-1]) == "VP" and i < len(chunks)-1 and tag(chunks[i+1][-1]) == "NP":
if chunk[-1][-1] == "O":
id += 1
for token in chunk:
token[-1] = "VP-" + str(id)
for token in chunks[i+1]:
token[-1] = "*NP-OBJ-" + str(id)
token[-1] = token[-1].lstrip("O-*")
# This is more a proof-of-concept than useful in practice:
# PP-LOC = be + in|at + the|my
# PP-DIR = go + to|towards + the|my
for i, chunk in enumerate(chunks):
if 0 < i < len(chunks)-1 and len(chunk) == 1 and chunk[-1][-1] == "O":
t0, t1, t2 = chunks[i-1][-1], chunks[i][0], chunks[i+1][0] # previous / current / next
if tag(t1) == "PP" and t2[1] in ("DT", "PR", "PRP$"):
if t0[0] in BE and t1[0] in ("in", "at") : t1[-1] = "PP-LOC"
if t0[0] in GO and t1[0] in ("to", "towards") : t1[-1] = "PP-DIR"
    related = []
    for chunk in chunks:
        related.extend(chunk)
return related
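# Example (illustrative): the NP before the VP is labeled as subject.
#   find_relations([["the", "DT", "B-NP"], ["cat", "NN", "I-NP"], ["sat", "VBD", "B-VP"]])
#   # => [["the", "DT", "B-NP", "NP-SBJ-1"], ["cat", "NN", "I-NP", "NP-SBJ-1"], ["sat", "VBD", "B-VP", "VP-1"]]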
#--- KEYWORDS EXTRACTION ---------------------------------------------------------------------------
def find_keywords(string, parser, top=10, frequency={}, **kwargs):
""" Returns a sorted list of keywords in the given string.
The given parser (e.g., pattern.en.parser) is used to identify noun phrases.
The given frequency dictionary can be a reference corpus,
with relative document frequency (df, 0.0-1.0) for each lemma,
e.g., {"the": 0.8, "cat": 0.1, ...}
"""
lemmata = kwargs.pop("lemmata", kwargs.pop("stem", True))
# Parse the string and extract noun phrases (NP).
chunks = []
wordcount = 0
for sentence in parser.parse(string, chunks=True, lemmata=lemmata).split():
for w in sentence: # ["cats", "NNS", "I-NP", "O", "cat"]
if w[2] == "B-NP":
chunks.append([w])
wordcount += 1
elif w[2] == "I-NP" and w[1][:3] == chunks[-1][-1][1][:3] == "NNP":
                chunks[-1][-1][0] += " " + w[0] # Collapse NNPs: "Ms Kitty".
chunks[-1][-1][-1] += " " + w[-1]
elif w[2] == "I-NP":
chunks[-1].append(w)
wordcount += 1
# Rate the nouns in noun phrases.
m = {}
for i, chunk in enumerate(chunks):
head = True
        if parser.language not in ("ca", "es", "pt", "fr", "it", "ro"):
# Head of "cat hair" => "hair".
# Head of "poils de chat" => "poils".
chunk = list(reversed(chunk))
for w in chunk:
if w[1].startswith("NN"):
if lemmata:
k = w[-1]
else:
k = w[0].lower()
                if k not in m:
m[k] = [0.0, set(), 1.0, 1.0, 1.0]
# Higher score for chunks that appear more frequently.
m[k][0] += 1 / float(wordcount)
# Higher score for chunks that appear in more contexts (semantic centrality).
m[k][1].add(" ".join(map(lambda x: x[0], chunk)).lower())
# Higher score for chunks at the start (25%) of the text.
m[k][2] += 1 if float(i) / len(chunks) <= 0.25 else 0
# Higher score for chunks not in a prepositional phrase.
m[k][3] += 1 if w[3] == "O" else 0
# Higher score for chunk head.
m[k][4] += 1 if head else 0
head = False
# Rate tf-idf if a frequency dict is given.
for k in m:
if frequency:
df = frequency.get(k, 0.0)
df = max(df, 1e-10)
            df = log(1.0 / df) # Natural logarithm of the inverse document frequency.
else:
df = 1.0
m[k][0] = max(1e-10, m[k][0] * df)
m[k][1] = 1 + float(len(m[k][1]))
    # Sort candidates alphabetically, then by score in descending order.
    # The harmonic mean emphasizes the tf-idf score.
hmean = lambda a: len(a) / sum(1.0 / x for x in a)
m = [(hmean(m[k]), k) for k in m]
m = sorted(m, key=lambda x: x[1])
m = sorted(m, key=lambda x: x[0], reverse=True)
m = [k for score, k in m]
return m[:top]
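# Example (a sketch; pattern.en.parser is one parser that can be passed in):
#   from pattern.en import parser
#   find_keywords("The black cat sat on the mat.", parser=parser, top=3)
#   # => e.g., ["cat", "mat"]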
#### COMMAND LINE ##################################################################################
# The commandline() function enables command line support for a Parser.
# The following code can be added to pattern.en, for example:
#
# from pattern.text import Parser, commandline
# parse = Parser(lexicon=LEXICON).parse
# if __name__ == "main":
# commandline(parse)
#
# The parser is then accessible from the command line:
# python -m pattern.en.parser xml -s "Hello, my name is Dr. Sbaitso. Nice to meet you." -OTCLI
def commandline(parse=Parser().parse):
import optparse
import codecs
p = optparse.OptionParser()
p.add_option("-f", "--file", dest="file", action="store", help="text file to parse", metavar="FILE")
p.add_option("-s", "--string", dest="string", action="store", help="text string to parse", metavar="STRING")
p.add_option("-O", "--tokenize", dest="tokenize", action="store_true", help="tokenize the input")
p.add_option("-T", "--tags", dest="tags", action="store_true", help="parse part-of-speech tags")
p.add_option("-C", "--chunks", dest="chunks", action="store_true", help="parse chunk tags")
p.add_option("-R", "--relations", dest="relations", action="store_true", help="find verb/predicate relations")
p.add_option("-L", "--lemmata", dest="lemmata", action="store_true", help="find word lemmata")
p.add_option("-e", "--encoding", dest="encoding", action="store_true", help="character encoding", default="utf-8")
p.add_option("-v", "--version", dest="version", action="store_true", help="version info")
o, arguments = p.parse_args()
# Version info.
if o.version:
sys.path.insert(0, os.path.join(MODULE, "..", ".."))
from pattern import __version__
print(__version__)
sys.path.pop(0)
# Either a text file (-f) or a text string (-s) must be supplied.
s = o.file and codecs.open(o.file, "r", o.encoding).read() or o.string
# The given text can be parsed in two modes:
# - implicit: parse everything (tokenize, tag/chunk, find relations, lemmatize).
# - explicit: define what to parse manually.
if s:
explicit = False
for option in [o.tokenize, o.tags, o.chunks, o.relations, o.lemmata]:
if option is not None: explicit=True; break
if not explicit:
a = {"encoding": o.encoding }
else:
a = {"tokenize": o.tokenize or False,
"tags": o.tags or False,
"chunks": o.chunks or False,
"relations": o.relations or False,
"lemmata": o.lemmata or False,
"encoding": o.encoding }
s = parse(s, **a)
# The output can be either slash-formatted string or XML.
if "xml" in arguments:
s = Tree(s, s.tags).xml
print(encode_utf8(s))
#### VERBS #########################################################################################
#--- VERB TENSES -----------------------------------------------------------------------------------
# Conjugation is the inflection of verbs by tense, person, number, mood and aspect.
# VERB TENSE:
INFINITIVE, PRESENT, PAST, FUTURE = \
INF, PRES, PST, FUT = \
"infinitive", "present", "past", "future"
# VERB PERSON:
# 1st person = I or we (plural).
# 2nd person = you.
# 3rd person = he, she, it or they (plural).
FIRST, SECOND, THIRD = \
1, 2, 3
# VERB NUMBER:
# singular number = I, you, he, she, it.
# plural number = we, you, they.
SINGULAR, PLURAL = \
SG, PL = \
"singular", "plural"
# VERB MOOD:
# indicative mood = a fact: "the cat meowed".
# imperative mood = a command: "meow!".
# conditional mood = a hypothesis: "a cat *will* meow *if* it is hungry".
# subjunctive mood = a wish, possibility or necessity: "I *wish* the cat *would* stop meowing".
INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE = \
IND, IMP, COND, SJV = \
"indicative", "imperative", "conditional", "subjunctive"
# VERB ASPECT:
# imperfective aspect = a habitual or ongoing action: "it was midnight; the cat meowed".
# perfective aspect = a momentary or completed action: "I flung my pillow at the cat".
# progressive aspect = an incomplete action in progress: "the cat was meowing".
# Note: the progressive aspect is a subtype of the imperfective aspect.
IMPERFECTIVE, PERFECTIVE, PROGRESSIVE = \
IPFV, PFV, PROG = \
"imperfective", "perfective", "progressive"
# Imperfect = past tense + imperfective aspect.
# Preterite = past tense + perfective aspect.
IMPERFECT = "imperfect"
PRETERITE = "preterite"
# Participle = present tense + progressive aspect.
PARTICIPLE, GERUND = "participle", "gerund"
# Continuous aspect ≈ progressive aspect.
CONTINUOUS = CONT = "continuous"
_ = None # prettify the table =>
# Unique index per tense (= tense + person + number + mood + aspect + negated? + aliases).
# The index is used to describe the format of the verb lexicon file.
# The aliases can be passed to Verbs.conjugate() and Tenses.__contains__().
TENSES = {
None: (None, _, _, _, _, False, (None ,)), # ENGLISH SPANISH GERMAN DUTCH FRENCH
0 : ( INF, _, _, _, _, False, ("inf" ,)), # to be ser sein zijn être
1 : (PRES, 1, SG, IND, IPFV, False, ("1sg" ,)), # I am soy bin ben suis
2 : (PRES, 2, SG, IND, IPFV, False, ("2sg" ,)), # you are eres bist bent es
3 : (PRES, 3, SG, IND, IPFV, False, ("3sg" ,)), # (s)he is es ist is est
4 : (PRES, 1, PL, IND, IPFV, False, ("1pl" ,)), # we are somos sind zijn sommes
5 : (PRES, 2, PL, IND, IPFV, False, ("2pl" ,)), # you are sois seid zijn êtes
6 : (PRES, 3, PL, IND, IPFV, False, ("3pl" ,)), # they are son sind zijn sont
7 : (PRES, _, PL, IND, IPFV, False, ( "pl" ,)), # are
8 : (PRES, _, _, IND, PROG, False, ("part" ,)), # being siendo zijnd étant
9 : (PRES, 1, SG, IND, IPFV, True, ("1sg-" ,)), # I am not
10 : (PRES, 2, SG, IND, IPFV, True, ("2sg-" ,)), # you aren't
11 : (PRES, 3, SG, IND, IPFV, True, ("3sg-" ,)), # (s)he isn't
12 : (PRES, 1, PL, IND, IPFV, True, ("1pl-" ,)), # we aren't
13 : (PRES, 2, PL, IND, IPFV, True, ("2pl-" ,)), # you aren't
14 : (PRES, 3, PL, IND, IPFV, True, ("3pl-" ,)), # they aren't
15 : (PRES, _, PL, IND, IPFV, True, ( "pl-" ,)), # aren't
16 : (PRES, _, _, IND, IPFV, True, ( "-" ,)), # isn't
17 : ( PST, 1, SG, IND, IPFV, False, ("1sgp" ,)), # I was era war was étais
18 : ( PST, 2, SG, IND, IPFV, False, ("2sgp" ,)), # you were eras warst was étais
19 : ( PST, 3, SG, IND, IPFV, False, ("3sgp" ,)), # (s)he was era war was était
20 : ( PST, 1, PL, IND, IPFV, False, ("1ppl" ,)), # we were éramos waren waren étions
21 : ( PST, 2, PL, IND, IPFV, False, ("2ppl" ,)), # you were erais wart waren étiez
22 : ( PST, 3, PL, IND, IPFV, False, ("3ppl" ,)), # they were eran waren waren étaient
23 : ( PST, _, PL, IND, IPFV, False, ( "ppl" ,)), # were
24 : ( PST, _, _, IND, PROG, False, ("ppart",)), # been sido gewesen geweest été
25 : ( PST, _, _, IND, IPFV, False, ( "p" ,)), # was
26 : ( PST, 1, SG, IND, IPFV, True, ("1sgp-",)), # I wasn't
27 : ( PST, 2, SG, IND, IPFV, True, ("2sgp-",)), # you weren't
28 : ( PST, 3, SG, IND, IPFV, True, ("3sgp-",)), # (s)he wasn't
29 : ( PST, 1, PL, IND, IPFV, True, ("1ppl-",)), # we weren't
30 : ( PST, 2, PL, IND, IPFV, True, ("2ppl-",)), # you weren't
31 : ( PST, 3, PL, IND, IPFV, True, ("3ppl-",)), # they weren't
32 : ( PST, _, PL, IND, IPFV, True, ( "ppl-",)), # weren't
33 : ( PST, _, _, IND, IPFV, True, ( "p-" ,)), # wasn't
34 : ( PST, 1, SG, IND, PFV, False, ("1sg+" ,)), # I fui fus
35 : ( PST, 2, SG, IND, PFV, False, ("2sg+" ,)), # you fuiste fus
36 : ( PST, 3, SG, IND, PFV, False, ("3sg+" ,)), # (s)he fue fut
37 : ( PST, 1, PL, IND, PFV, False, ("1pl+" ,)), # we fuimos fûmes
38 : ( PST, 2, PL, IND, PFV, False, ("2pl+" ,)), # you fuisteis fûtes
39 : ( PST, 3, PL, IND, PFV, False, ("3pl+" ,)), # they fueron furent
40 : ( FUT, 1, SG, IND, IPFV, False, ("1sgf" ,)), # I seré serai
41 : ( FUT, 2, SG, IND, IPFV, False, ("2sgf" ,)), # you serás seras
42 : ( FUT, 3, SG, IND, IPFV, False, ("3sgf" ,)), # (s)he será sera
43 : ( FUT, 1, PL, IND, IPFV, False, ("1plf" ,)), # we seremos serons
44 : ( FUT, 2, PL, IND, IPFV, False, ("2plf" ,)), # you seréis serez
    45 : ( FUT, 3, PL, IND, IPFV, False, ("3plf" ,)), # they serán seront
46 : (PRES, 1, SG, COND, IPFV, False, ("1sg->",)), # I sería serais
47 : (PRES, 2, SG, COND, IPFV, False, ("2sg->",)), # you serías serais
48 : (PRES, 3, SG, COND, IPFV, False, ("3sg->",)), # (s)he sería serait
49 : (PRES, 1, PL, COND, IPFV, False, ("1pl->",)), # we seríamos serions
50 : (PRES, 2, PL, COND, IPFV, False, ("2pl->",)), # you seríais seriez
51 : (PRES, 3, PL, COND, IPFV, False, ("3pl->",)), # they serían seraient
52 : (PRES, 2, SG, IMP, IPFV, False, ("2sg!" ,)), # you sé sei sois
521: (PRES, 3, SG, IMP, IPFV, False, ("3sg!" ,)), # (s)he
53 : (PRES, 1, PL, IMP, IPFV, False, ("1pl!" ,)), # we seien soyons
54 : (PRES, 2, PL, IMP, IPFV, False, ("2pl!" ,)), # you sed seid soyez
541: (PRES, 3, PL, IMP, IPFV, False, ("3pl!" ,)), # they
55 : (PRES, 1, SG, SJV, IPFV, False, ("1sg?" ,)), # I sea sei sois
56 : (PRES, 2, SG, SJV, IPFV, False, ("2sg?" ,)), # you seas seist sois
57 : (PRES, 3, SG, SJV, IPFV, False, ("3sg?" ,)), # (s)he sea sei soit
58 : (PRES, 1, PL, SJV, IPFV, False, ("1pl?" ,)), # we seamos seien soyons
59 : (PRES, 2, PL, SJV, IPFV, False, ("2pl?" ,)), # you seáis seiet soyez
60 : (PRES, 3, PL, SJV, IPFV, False, ("3pl?" ,)), # they sean seien soient
61 : (PRES, 1, SG, SJV, PFV, False, ("1sg?+",)), # I
62 : (PRES, 2, SG, SJV, PFV, False, ("2sg?+",)), # you
63 : (PRES, 3, SG, SJV, PFV, False, ("3sg?+",)), # (s)he
64 : (PRES, 1, PL, SJV, PFV, False, ("1pl?+",)), # we
65 : (PRES, 2, PL, SJV, PFV, False, ("2pl?+",)), # you
66 : (PRES, 3, PL, SJV, PFV, False, ("3pl?+",)), # they
67 : ( PST, 1, SG, SJV, IPFV, False, ("1sgp?",)), # I fuera wäre fusse
68 : ( PST, 2, SG, SJV, IPFV, False, ("2sgp?",)), # you fueras wärest fusses
69 : ( PST, 3, SG, SJV, IPFV, False, ("3sgp?",)), # (s)he fuera wäre fût
70 : ( PST, 1, PL, SJV, IPFV, False, ("1ppl?",)), # we fuéramos wären fussions
71 : ( PST, 2, PL, SJV, IPFV, False, ("2ppl?",)), # you fuerais wäret fussiez
72 : ( PST, 3, PL, SJV, IPFV, False, ("3ppl?",)), # they fueran wären fussent
}
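# For example, TENSES[19] == (PST, 3, SG, IND, IPFV, False, ("3sgp",)),
# i.e., past tense, 3rd person singular, indicative, imperfective: "(s)he was".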
# Map tenses and aliases to unique index.
# Aliases include:
# - a number alias: "s", "sg", "singular" => SINGULAR,
# - a short string: "1sg" => 1st person singular present,
# - a unique index: 1 => 1st person singular present,
# - Penn Treebank: "VBP" => 1st person singular present.
TENSES_ID = {}
TENSES_ID[INFINITIVE] = 0
for i, (tense, person, number, mood, aspect, negated, aliases) in TENSES.items():
for a in aliases + (i,):
TENSES_ID[i] = \
TENSES_ID[a] = \
TENSES_ID[(tense, person, number, mood, aspect, negated)] = i
if number == SG:
for sg in ("s", "sg", "singular"):
TENSES_ID[(tense, person, sg, mood, aspect, negated)] = i
if number == PL:
for pl in ("p", "pl", "plural"):
TENSES_ID[(tense, person, pl, mood, aspect, negated)] = i
# Map Penn Treebank tags to unique index.
for tag, tense in (
("VB", 0 ), # infinitive
("VBP", 1 ), # present 1 singular
("VBZ", 3 ), # present 3 singular
("VBG", 8 ), # present participle
("VBN", 24), # past participle
("VBD", 25)): # past
TENSES_ID[tag.lower()] = tense
# tense(tense=INFINITIVE)
# tense(tense=(PRESENT, 3, SINGULAR))
# tense(tense=PRESENT, person=3, number=SINGULAR, mood=INDICATIVE, aspect=IMPERFECTIVE, negated=False)
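# With the TENSES table above, for example:
# tense(PAST) => 19, i.e., (PAST, 3, SG, INDICATIVE, IMPERFECTIVE)
# tense("2sg") => 2 (alias lookup)
# tense(PAST, aspect=PERFECTIVE) => 36 (the preterite)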
def tense_id(*args, **kwargs):
""" Returns the tense id for a given (tense, person, number, mood, aspect, negated).
Aliases and compound forms (e.g., IMPERFECT) are disambiguated.
"""
# Unpack tense given as a tuple, e.g., tense((PRESENT, 1, SG)):
if len(args) == 1 and isinstance(args[0], (list, tuple)):
if args[0] not in ((PRESENT, PARTICIPLE), (PAST, PARTICIPLE)):
args = args[0]
    # With no parameters, tense defaults to INFINITIVE; otherwise to PRESENT.
if len(args) == 0 and len(kwargs) == 0:
t = INFINITIVE
else:
t = PRESENT
# Set default values.
tense = kwargs.get("tense" , args[0] if len(args) > 0 else t)
person = kwargs.get("person" , args[1] if len(args) > 1 else 3) or None
number = kwargs.get("number" , args[2] if len(args) > 2 else SINGULAR)
mood = kwargs.get("mood" , args[3] if len(args) > 3 else INDICATIVE)
aspect = kwargs.get("aspect" , args[4] if len(args) > 4 else IMPERFECTIVE)
negated = kwargs.get("negated", args[5] if len(args) > 5 else False)
# Disambiguate wrong order of parameters.
if mood in (PERFECTIVE, IMPERFECTIVE):
mood, aspect = INDICATIVE, mood
# Disambiguate INFINITIVE.
# Disambiguate PARTICIPLE, IMPERFECT, PRETERITE.
# These are often considered to be tenses but are in fact tense + aspect.
if tense == INFINITIVE:
        person = number = mood = aspect = None; negated = False
if tense in ((PRESENT, PARTICIPLE), PRESENT+PARTICIPLE, PARTICIPLE, GERUND):
tense, aspect = PRESENT, PROGRESSIVE
if tense in ((PAST, PARTICIPLE), PAST+PARTICIPLE):
tense, aspect = PAST, PROGRESSIVE
if tense == IMPERFECT:
tense, aspect = PAST, IMPERFECTIVE
if tense == PRETERITE:
tense, aspect = PAST, PERFECTIVE
if aspect in (CONTINUOUS, PARTICIPLE, GERUND):
aspect = PROGRESSIVE
if aspect == PROGRESSIVE:
person = number = None
# Disambiguate CONDITIONAL.
# In Spanish, the conditional is regarded as an indicative tense.
if tense == CONDITIONAL and mood == INDICATIVE:
tense, mood = PRESENT, CONDITIONAL
# Disambiguate aliases: "pl" =>
# (PRESENT, None, PLURAL, INDICATIVE, IMPERFECTIVE, False).
    # Aliases can be strings ("2sg"), integers (2) or tuples, so only lowercase strings:
    return TENSES_ID.get(tense.lower() if isinstance(tense, basestring) else tense,
           TENSES_ID.get((tense, person, number, mood, aspect, negated)))
tense = tense_id
#--- VERB CONJUGATIONS -----------------------------------------------------------------------------
# Verb conjugations based on a table of known verbs and rules for unknown verbs.
# Verb conjugations are useful to find the verb infinitive in the parser's lemmatizer.
# For unknown verbs, Verbs.find_lemma() and Verbs.find_lexeme() are called.
# These must be implemented in a subclass with rules for unknown verbs.
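# A minimal sketch of such a subclass (a naive illustration, not a real rule set):
#   class MyVerbs(Verbs):
#       def find_lemma(self, verb):
#           return verb[:-1] if verb.endswith("s") else verb
#       def find_lexeme(self, verb):
#           return [verb, verb + "s"]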
class Verbs(lazydict):
def __init__(self, path="", format=[], default={}, language=None):
""" A dictionary of verb infinitives, each linked to a list of conjugated forms.
Each line in the file at the given path is one verb, with the tenses separated by a comma.
The format defines the order of tenses (see TENSES).
The default dictionary defines default tenses for omitted tenses.
"""
self._path = path
self._language = language
self._format = dict((TENSES_ID[id], i) for i, id in enumerate(format))
self._default = default
self._inverse = {}
def load(self):
# have,,,has,,having,,,,,had,had,haven't,,,hasn't,,,,,,,hadn't,hadn't
id = self._format[TENSES_ID[INFINITIVE]]
for v in _read(self._path):
v = v.split(",")
dict.__setitem__(self, v[id], v)
for x in (x for x in v if x):
self._inverse[x] = v[id]
@property
def path(self):
return self._path
@property
def language(self):
return self._language
@property
def infinitives(self):
""" Yields a dictionary of (infinitive, [inflections])-items.
"""
if dict.__len__(self) == 0:
self.load()
return self
@property
def inflections(self):
""" Yields a dictionary of (inflected, infinitive)-items.
"""
if dict.__len__(self) == 0:
self.load()
return self._inverse
@property
def TENSES(self):
""" Yields a list of tenses for this language, excluding negations.
Each tense is a (tense, person, number, mood, aspect)-tuple.
"""
a = set(TENSES[id] for id in self._format)
a = a.union(set(TENSES[id] for id in self._default.keys()))
a = a.union(set(TENSES[id] for id in self._default.values()))
a = sorted(x[:-2] for x in a if x[-2] is False) # Exclude negation.
return a
def lemma(self, verb, parse=True):
""" Returns the infinitive form of the given verb, or None.
"""
if dict.__len__(self) == 0:
self.load()
if verb.lower() in self._inverse:
return self._inverse[verb.lower()]
if verb in self._inverse:
return self._inverse[verb]
if parse is True: # rule-based
return self.find_lemma(verb)
def lexeme(self, verb, parse=True):
""" Returns a list of all possible inflections of the given verb.
"""
a = []
b = self.lemma(verb, parse=parse)
if b in self:
a = [x for x in self[b] if x != ""]
elif parse is True: # rule-based
a = self.find_lexeme(b)
        # Remove duplicates while preserving order.
        u = []
        for x in a:
            if x not in u:
                u.append(x)
        return u
def conjugate(self, verb, *args, **kwargs):
""" Inflects the verb and returns the given tense (or None).
            For example, for the verb "to be":
            - Verbs.conjugate("is", INFINITIVE) => be
- Verbs.conjugate("be", PRESENT, 1, SINGULAR) => I am
- Verbs.conjugate("be", PRESENT, 1, PLURAL) => we are
- Verbs.conjugate("be", PAST, 3, SINGULAR) => he was
- Verbs.conjugate("be", PAST, aspect=PROGRESSIVE) => been
- Verbs.conjugate("be", PAST, person=1, negated=True) => I wasn't
"""
id = tense_id(*args, **kwargs)
# Get the tense index from the format description (or a default).
i1 = self._format.get(id)
i2 = self._format.get(self._default.get(id))
i3 = self._format.get(self._default.get(self._default.get(id)))
b = self.lemma(verb, parse=kwargs.get("parse", True))
v = []
# Get the verb lexeme and return the requested index.
if b in self:
v = self[b]
for i in (i1, i2, i3):
if i is not None and 0 <= i < len(v) and v[i]:
return v[i]
if kwargs.get("parse", True) is True: # rule-based
v = self.find_lexeme(b)
for i in (i1, i2, i3):
if i is not None and 0 <= i < len(v) and v[i]:
return v[i]
def tenses(self, verb, parse=True):
""" Returns a list of possible tenses for the given inflected verb.
"""
verb = verb.lower()
a = set()
b = self.lemma(verb, parse=parse)
v = []
if b in self:
v = self[b]
elif parse is True: # rule-based
v = self.find_lexeme(b)
        # For each form in the verb lexeme that matches the given inflected verb,
        # 1) retrieve the corresponding tense tuple,
        # 2) retrieve the tense tuples for which that tense is a default.
        for i, form in enumerate(v):
            if form == verb:
for id, index in self._format.items():
if i == index:
a.add(id)
for id1, id2 in self._default.items():
if id2 in a:
a.add(id1)
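                # Repeat to pick up defaults-of-defaults
                # (cf. the i3 lookup in conjugate() above).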
for id1, id2 in self._default.items():
if id2 in a:
a.add(id1)
a = (TENSES[id][:-2] for id in a)
a = Tenses(sorted(a))
return a
def find_lemma(self, verb):
# Must be overridden in a subclass.
# Must return the infinitive for the given conjugated (unknown) verb.
return verb
def find_lexeme(self, verb):
# Must be overridden in a subclass.
# Must return the list of conjugations for the given (unknown) verb.
return []
class Tenses(list):
def __contains__(self, tense):
# t in tenses(verb) also works when t is an alias (e.g. "1sg").
return list.__contains__(self, TENSES[tense_id(tense)][:-2])
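# For example, with an English verb lexicon (e.g., pattern.en.verbs):
#   PAST in verbs.tenses("was")    => True (defaults to 3rd person singular)
#   "3sgp" in verbs.tenses("was")  => True (same tense, by alias)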
### SENTIMENT POLARITY LEXICON #####################################################################
# A sentiment lexicon can be used to discern objective facts from subjective opinions in text.
# Each word in the lexicon has scores for:
# 1) polarity: negative vs. positive (-1.0 => +1.0)
# 2) subjectivity: objective vs. subjective (+0.0 => +1.0)
# 3) intensity: modifies next word? (x0.5 => x2.0)
# For English, adverbs are used as modifiers (e.g., "very good").
# For Dutch, adverbial adjectives are used as modifiers
# ("hopeloos voorspelbaar", "ontzettend spannend", "verschrikkelijk goed").
# Negation words (e.g., "not") reverse the polarity of the following word.
# Sentiment()(txt) returns an averaged (polarity, subjectivity)-tuple.
# Sentiment().assessments(txt) returns a list of (chunk, polarity, subjectivity, label)-tuples.
# Semantic labels are useful for fine-grained analysis, e.g.,
# negative words + positive emoticons could indicate cynicism.
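# For example (a sketch; "en-sentiment.xml" is a hypothetical lexicon file):
#   s = Sentiment(path="en-sentiment.xml", synset="wordnet_id")
#   polarity, subjectivity = s("A very good movie!")
#   s("A very good movie!").assessments  # [(chunk, polarity, subjectivity, label), ...]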
# Semantic labels:
MOOD = "mood" # emoticons, emojis
IRONY = "irony" # sarcasm mark (!)
NOUN, VERB, ADJECTIVE, ADVERB = \
"NN", "VB", "JJ", "RB"
RE_SYNSET = re.compile(r"^[acdnrv][-_][0-9]+$")
def avg(v):
    return sum(v) / float(len(v) or 1)
class Score(tuple):
    def __new__(cls, polarity, subjectivity, assessments=[]):
        """ A (polarity, subjectivity)-tuple with an assessments property.
        """
        return tuple.__new__(cls, [polarity, subjectivity])
def __init__(self, polarity, subjectivity, assessments=[]):
self.assessments = assessments
class Sentiment(lazydict):
def __init__(self, path="", language=None, synset=None, confidence=None, **kwargs):
""" A dictionary of words (adjectives) and polarity scores (positive/negative).
The value for each word is a dictionary of part-of-speech tags.
The value for each word POS-tag is a tuple with values for
            polarity (-1.0 => +1.0), subjectivity (0.0 => 1.0) and intensity (x0.5 => x2.0).
"""
self._path = path # XML file path.
        self._language   = language   # XML language attribute ("en", "fr", ...)
        self._confidence = confidence # XML confidence attribute threshold (>=).
self._synset = synset # XML synset attribute ("wordnet_id", "cornetto_id", ...)
self._synsets = {} # {"a-01123879": (1.0, 1.0, 1.0)}
self.labeler = {} # {"dammit": "profanity"}
self.tokenizer = kwargs.get("tokenizer", find_tokens)
self.negations = kwargs.get("negations", ("no", "not", "n't", "never"))
self.modifiers = kwargs.get("modifiers", ("RB",))
self.modifier = kwargs.get("modifier" , lambda w: w.endswith("ly"))
@property
def path(self):
return self._path
@property
def language(self):
return self._language
@property
def confidence(self):
return self._confidence
def load(self, path=None):
""" Loads the XML-file (with sentiment annotations) from the given path.
By default, Sentiment.path is lazily loaded.
"""
# <word form="great" wordnet_id="a-01123879" pos="JJ" polarity="1.0" subjectivity="1.0" intensity="1.0" />
# <word form="damnmit" polarity="-0.75" subjectivity="1.0" label="profanity" />
if not path:
path = self._path
if not os.path.exists(path):
return
words, synsets, labels = {}, {}, {}
xml = cElementTree.parse(path)
xml = xml.getroot()
for w in xml.findall("word"):
if self._confidence is None \
or self._confidence <= float(w.attrib.get("confidence", 0.0)):
w, pos, p, s, i, label, synset = (
w.attrib.get("form"),
w.attrib.get("pos"),
w.attrib.get("polarity", 0.0),
w.attrib.get("subjectivity", 0.0),
w.attrib.get("intensity", 1.0),
w.attrib.get("label"),
w.attrib.get(self._synset) # wordnet_id, cornetto_id, ...
)
psi = (float(p), float(s), float(i))
if w:
words.setdefault(w, {}).setdefault(pos, []).append(psi)
if w and label:
labels[w] = label
if synset:
synsets.setdefault(synset, []).append(psi)
self._language = xml.attrib.get("language", self._language)
# Average scores of all word senses per part-of-speech tag.
for w in words:
words[w] = dict((pos, [avg(each) for each in zip(*psi)]) for pos, psi in words[w].items())
# Average scores of all part-of-speech tags.
for w, pos in list(words.items()):
words[w][None] = [avg(each) for each in zip(*pos.values())]
# Average scores of all synonyms per synset.
for id, psi in synsets.items():
synsets[id] = [avg(each) for each in zip(*psi)]
dict.update(self, words)
dict.update(self.labeler, labels)
dict.update(self._synsets, synsets)
    def synset(self, id, pos=ADJECTIVE):
""" Returns a (polarity, subjectivity)-tuple for the given synset id.
For example, the adjective "horrible" has id 193480 in WordNet:
            Sentiment.synset(193480, pos="JJ") => (-0.6, 1.0).
"""
id = str(id).zfill(8)
if not id.startswith(("n-", "v-", "a-", "r-")):
if pos == NOUN:
id = "n-" + id
if pos == VERB:
id = "v-" + id
if pos == ADJECTIVE:
id = "a-" + id
if pos == ADVERB:
id = "r-" + id
if dict.__len__(self) == 0:
self.load()
try:
return tuple(self._synsets[id])[:2]
except KeyError: # Some WordNet id's are not zero padded.
return tuple(self._synsets.get(re.sub(r"-0+", "-", id), (0.0, 0.0))[:2])
def __call__(self, s, negation=True, **kwargs):
""" Returns a (polarity, subjectivity)-tuple for the given sentence,
with polarity between -1.0 and 1.0 and subjectivity between 0.0 and 1.0.
The sentence can be a string, Synset, Text, Sentence, Chunk, Word, Document, Vector.
An optional weight parameter can be given,
as a function that takes a list of words and returns a weight.
"""
def avg(assessments, weighted=lambda w: 1):
s, n = 0, 0
for words, score in assessments:
w = weighted(words)
s += w * score
n += w
return s / float(n or 1)
# A pattern.en.wordnet.Synset.
# Sentiment(synsets("horrible", "JJ")[0]) => (-0.6, 1.0)
if hasattr(s, "gloss"):
a = [(s.synonyms[0],) + self.synset(s.id, pos=s.pos) + (None,)]
# A synset id.
# Sentiment("a-00193480") => horrible => (-0.6, 1.0) (English WordNet)
# Sentiment("c_267") => verschrikkelijk => (-0.9, 1.0) (Dutch Cornetto)
elif isinstance(s, basestring) and RE_SYNSET.match(s):
            if dict.__len__(self) == 0:
                self.load()
            a = [(s,) + tuple(self._synsets.get(s, (0.0, 0.0))[:2]) + (None,)]
# A string of words.
# Sentiment("a horrible movie") => (-0.6, 1.0)
elif isinstance(s, basestring):
a = self.assessments(((w.lower(), None) for w in " ".join(self.tokenizer(s)).split()), negation)
# A pattern.en.Text.
elif hasattr(s, "sentences"):
a = self.assessments(((w.lemma or w.string.lower(), w.pos[:2]) for w in chain(*s)), negation)
# A pattern.en.Sentence or pattern.en.Chunk.
elif hasattr(s, "lemmata"):
a = self.assessments(((w.lemma or w.string.lower(), w.pos[:2]) for w in s.words), negation)
# A pattern.en.Word.
elif hasattr(s, "lemma"):
a = self.assessments(((s.lemma or s.string.lower(), s.pos[:2]),), negation)
# A pattern.vector.Document.
# Average score = weighted average using feature weights.
        # A bag-of-words is unordered: inject None between each two words
# to stop assessments() from scanning for preceding negation & modifiers.
elif hasattr(s, "terms"):
a = self.assessments(chain(*(((w, None), (None, None)) for w in s)), negation)
kwargs.setdefault("weight", lambda w: s.terms[w[0]])
# A dict of (word, weight)-items.
elif isinstance(s, dict):
a = self.assessments(chain(*(((w, None), (None, None)) for w in s)), negation)
kwargs.setdefault("weight", lambda w: s[w[0]])
# A list of words.
elif isinstance(s, list):
a = self.assessments(((w, None) for w in s), negation)
else:
a = []
weight = kwargs.get("weight", lambda w: 1)
# Each "w" in "a" is a (words, polarity, subjectivity, label)-tuple.
return Score(polarity = avg(map(lambda w: (w[0], w[1]), a), weight),
subjectivity = avg(map(lambda w: (w[0], w[2]), a), weight),
assessments = a)
    def assessments(self, words=[], negation=True):
""" Returns a list of (chunk, polarity, subjectivity, label)-tuples for the given list of words:
where chunk is a list of successive words: a known word optionally
preceded by a modifier ("very good") or a negation ("not good").
"""
a = []
m = None # Preceding modifier (i.e., adverb or adjective).
n = None # Preceding negation (e.g., "not beautiful").
for w, pos in words:
# Only assess known words, preferably by part-of-speech tag.
# Including unknown words (polarity 0.0 and subjectivity 0.0) lowers the average.
if w is None:
continue
if w in self and pos in self[w]:
p, s, i = self[w][pos]
# Known word not preceded by a modifier ("good").
if m is None:
a.append(dict(w=[w], p=p, s=s, i=i, n=1, x=self.labeler.get(w)))
# Known word preceded by a modifier ("really good").
if m is not None:
a[-1]["w"].append(w)
a[-1]["p"] = max(-1.0, min(p * a[-1]["i"], +1.0))
a[-1]["s"] = max(-1.0, min(s * a[-1]["i"], +1.0))
a[-1]["i"] = i
a[-1]["x"] = self.labeler.get(w)
# Known word preceded by a negation ("not really good").
if n is not None:
a[-1]["w"].insert(0, n)
a[-1]["i"] = 1.0 / a[-1]["i"]
a[-1]["n"] = -1
# Known word may be a negation.
# Known word may be modifying the next word (i.e., it is a known adverb).
m = None
n = None
                if (pos and pos in self.modifiers) or any(map(self[w].__contains__, self.modifiers)):
m = (w, pos)
if negation and w in self.negations:
n = w
else:
# Unknown word may be a negation ("not good").
if negation and w in self.negations:
n = w
# Unknown word. Retain negation across small words ("not a good").
elif n and len(w.strip("'")) > 1:
n = None
# Unknown word may be a negation preceded by a modifier ("really not good").
if n is not None and m is not None and (pos in self.modifiers or self.modifier(m[0])):
a[-1]["w"].append(n)
a[-1]["n"] = -1
n = None
# Unknown word. Retain modifier across small words ("really is a good").
elif m and len(w) > 2:
m = None
# Exclamation marks boost previous word.
if w == "!" and len(a) > 0:
a[-1]["w"].append("!")
a[-1]["p"] = max(-1.0, min(a[-1]["p"] * 1.25, +1.0))
# Exclamation marks in parentheses indicate sarcasm.
if w == "(!)":
a.append(dict(w=[w], p=0.0, s=1.0, i=1.0, n=1, x=IRONY))
# EMOTICONS: {("grin", +1.0): set((":-D", ":D"))}
if w.isalpha() is False and len(w) <= 5 and w not in PUNCTUATION: # speedup
for (type, p), e in EMOTICONS.items():
if w in map(lambda e: e.lower(), e):
a.append(dict(w=[w], p=p, s=1.0, i=1.0, n=1, x=MOOD))
break
for i in range(len(a)):
w = a[i]["w"]
p = a[i]["p"]
s = a[i]["s"]
n = a[i]["n"]
x = a[i]["x"]
# "not good" = slightly bad, "not bad" = slightly good.
a[i] = (w, p * -0.5 if n < 0 else p, s, x)
return a
    def annotate(self, word, pos=None, polarity=0.0, subjectivity=0.0, intensity=1.0, label=None):
""" Annotates the given word with polarity, subjectivity and intensity scores,
and optionally a semantic label (e.g., MOOD for emoticons, IRONY for "(!)").
"""
w = self.setdefault(word, {})
w[pos] = w[None] = (polarity, subjectivity, intensity)
if label:
self.labeler[word] = label
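# For example (a sketch with a hand-annotated entry):
#   s = Sentiment()
#   s.annotate("awesome", "JJ", polarity=1.0, subjectivity=1.0)
#   s("awesome")  # => (1.0, 1.0)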
#### SPELLING CORRECTION ###########################################################################
# Based on: Peter Norvig, "How to Write a Spelling Corrector", http://norvig.com/spell-correct.html
class Spelling(lazydict):
ALPHA = "abcdefghijklmnopqrstuvwxyz"
def __init__(self, path=""):
self._path = path
def load(self):
for x in _read(self._path):
x = x.split()
dict.__setitem__(self, x[0], int(x[1]))
@property
def path(self):
return self._path
@property
def language(self):
return self._language
@classmethod
    def train(cls, s, path="spelling.txt"):
""" Counts the words in the given string and saves the probabilities at the given path.
This can be used to generate a new model for the Spelling() constructor.
"""
model = {}
for w in re.findall("[a-z]+", s.lower()):
            model[w] = model.get(w, 0) + 1
model = ("%s %s" % (k, v) for k, v in sorted(model.items()))
model = "\n".join(model)
f = open(path, "w")
f.write(model)
f.close()
def _edit1(self, w):
""" Returns a set of words with edit distance 1 from the given word.
"""
        # Of all spelling errors, 80% are covered by edit distance 1.
# Edit distance 1 = one character deleted, swapped, replaced or inserted.
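        # For a word of length n this yields n deletes, n-1 transposes,
        # 26n replaces and 26(n+1) inserts = 54n+25 candidates.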
split = [(w[:i], w[i:]) for i in range(len(w) + 1)]
delete, transpose, replace, insert = (
[a + b[1:] for a, b in split if b],
[a + b[1] + b[0] + b[2:] for a, b in split if len(b) > 1],
[a + c + b[1:] for a, b in split for c in Spelling.ALPHA if b],
            [a + c + b for a, b in split for c in Spelling.ALPHA]
)
return set(delete + transpose + replace + insert)
def _edit2(self, w):
""" Returns a set of words with edit distance 2 from the given word
"""
        # Of all spelling errors, 99% are covered by edit distance 2.
# Only keep candidates that are actually known words (20% speedup).
return set(e2 for e1 in self._edit1(w) for e2 in self._edit1(e1) if e2 in self)
def _known(self, words=[]):
""" Returns the given list of words filtered by known words.
"""
return set(w for w in words if w in self)
def suggest(self, w):
""" Return a list of (word, confidence) spelling corrections for the given word,
based on the probability of known words with edit distance 1-2 from the given word.
"""
if len(self) == 0:
self.load()
if len(w) == 1:
return [(w, 1.0)] # I
if w in PUNCTUATION:
return [(w, 1.0)] # .?!
if w.replace(".", "").isdigit():
return [(w, 1.0)] # 1.5
candidates = self._known([w]) \
or self._known(self._edit1(w)) \
or self._known(self._edit2(w)) \
or [w]
candidates = [(self.get(c, 0.0), c) for c in candidates]
s = float(sum(p for p, w in candidates) or 1)
candidates = sorted(((p / s, w) for p, w in candidates), reverse=True)
candidates = [(w.istitle() and x.title() or x, p) for p, x in candidates] # case-sensitive
return candidates
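# For example (a sketch; "big.txt" is a hypothetical training corpus):
#   Spelling.train(open("big.txt").read(), path="spelling.txt")
#   s = Spelling(path="spelling.txt")
#   s.suggest("speling")  # => [("spelling", <probability>), ...]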
#### MULTILINGUAL ##################################################################################
# The default functions in each language submodule, with an optional language parameter:
# from pattern.text import parse
# print(parse("The cat sat on the mat.", language="en"))
# print(parse("De kat zat op de mat.", language="nl"))
LANGUAGES = ["en", "es", "de", "fr", "it", "nl"]
_modules = {}
def _module(language):
""" Returns the given language module (e.g., "en" => pattern.en).
"""
    # Python 3 rejects level=-1 (implicit relative import), so use level=0 there.
    return _modules.setdefault(language, __import__(language, globals(), {}, [], -1 if PY2 else 0))
def _multilingual(function, *args, **kwargs):
""" Returns the value from the function with the given name in the given language module.
By default, language="en".
"""
return getattr(_module(kwargs.pop("language", "en")), function)(*args, **kwargs)
def language(s):
""" Returns a (language, confidence)-tuple for the given string.
"""
s = decode_utf8(s)
s = set(w.strip(PUNCTUATION) for w in s.replace("'", "' ").split())
n = float(len(s) or 1)
p = {}
for xx in LANGUAGES:
lexicon = _module(xx).__dict__["lexicon"]
p[xx] = sum(1 for w in s if w in lexicon) / n
return max(p.items(), key=lambda kv: (kv[1], int(kv[0] == "en")))
lang = language
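# For example (confidence = fraction of words found in that language's lexicon):
#   language("De kat zat op de mat.")  # => ("nl", <confidence>)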
def tokenize(*args, **kwargs):
return _multilingual("tokenize", *args, **kwargs)
def parse(*args, **kwargs):
return _multilingual("parse", *args, **kwargs)
def parsetree(*args, **kwargs):
return _multilingual("parsetree", *args, **kwargs)
def split(*args, **kwargs):
return _multilingual("split", *args, **kwargs)
def tag(*args, **kwargs):
return _multilingual("tag", *args, **kwargs)
def keywords(*args, **kwargs):
return _multilingual("keywords", *args, **kwargs)
def suggest(*args, **kwargs):
return _multilingual("suggest", *args, **kwargs)
def sentiment(*args, **kwargs):
return _multilingual("sentiment", *args, **kwargs)
def singularize(*args, **kwargs):
return _multilingual("singularize", *args, **kwargs)
def pluralize(*args, **kwargs):
return _multilingual("pluralize", *args, **kwargs)
def conjugate(*args, **kwargs):
return _multilingual("conjugate", *args, **kwargs)
def predicative(*args, **kwargs):
return _multilingual("predicative", *args, **kwargs)