Source code for textblob_de.parsers

# -*- coding: utf-8 -*-
#
# Code imported from the main `TextBlob`_ library.
#
# :repo: `https://github.com/sloria/TextBlob`_
# :source: textblob/en/parsers.py
# :version: 2013-10-21 (a88e86a76a)
#
# :modified: 2014-08-04 <m.killer@langui.ch>
#
"""Default parsers for German.

>>> from textblob_de.parsers import PatternParser

or

>>> from textblob_de import PatternParser

"""
from __future__ import absolute_import
import string

from textblob.base import BaseParser

from textblob_de.packages import pattern_de
from textblob_de.tokenizers import PatternTokenizer

pattern_pprint = pattern_de.pprint
pattern_parse = pattern_de.parse
pattern_parsetree = pattern_de.parsetree

PUNCTUATION = string.punctuation


[docs]class PatternParser(BaseParser): '''Parser that uses the implementation in Tom de Smedt's pattern library. http://www.clips.ua.ac.be/pages/pattern-de#parser :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to :class:`PatternTokenizer() <textblob_de.tokenizers.PatternTokenizer>`. :param tokenize: (optional) Split punctuation marks from words? (Default ``True``) :param pprint: (optional) Use ``pattern``'s ``pprint`` function to display parse trees (Default ``False``) :param tags: (optional) Parse part-of-speech tags? (NN, JJ, ...) (Default ``True``) :param chunks: (optional) Parse chunks? (NP, VP, PNP, ...) (Default ``True``) :param relations: (optional) Parse chunk relations? (-SBJ, -OBJ, ...) (Default ``False``) :param lemmata: (optional) Parse lemmata? (schönes => schön) (Default ``False``) :param encoding: (optional) Input string encoding. (Default ``utf-8``) :param tagset: (optional) Penn Treebank II (default) or ('penn'|'universal'|'stts'). ''' def __init__( self, tokenizer=None, tokenize=True, pprint=False, tags=True, chunks=True, relations=False, lemmata=False, encoding='utf-8', tagset=None): self.tokenizer = tokenizer if tokenizer is not None else PatternTokenizer() self.pprint = pprint self.tokenize = tokenize self.tags = tags self.chunks = chunks self.relations = relations self.lemmata = lemmata self.encoding = encoding self.tagset = tagset if tagset is not None else None
[docs] def parse(self, text): """Parses the text. ``pattern.de.parse(**kwargs)`` can be passed to the parser instance and are documented in the main docstring of :class:`PatternParser() <textblob_de.parsers.PatternParser>`. :param str text: A string. """ #: Do not process empty strings (Issue #3) if text.strip() == "": return "" #: Do not process strings consisting of a single punctuation mark (Issue #4) elif text.strip() in PUNCTUATION: _sym = text.strip() if _sym in tuple('.?!'): _tag = "." else: _tag = _sym if self.lemmata: return "{0}/{1}/O/O/{0}".format(_sym, _tag) else: return "{0}/{1}/O/O".format(_sym, _tag) if self.tokenize: _tokenized = " ".join(self.tokenizer.tokenize(text)) else: _tokenized = text _parsed = pattern_parse(_tokenized, # text is tokenized before it is passed on to # pattern.de.parse tokenize=False, tags=self.tags, chunks=self.chunks, relations=self.relations, lemmata=self.lemmata, encoding=self.encoding, tagset=self.tagset) if self.pprint: _parsed = pattern_pprint(_parsed) return _parsed
[docs] def parsetree(self, text): """Returns a parsed ``pattern`` Text object from the given string.""" #: Do not process empty strings (Issue #3) if text.strip() == "": return "" #: Do not process strings consisting of a single punctuation mark (Issue #4) elif text.strip() in PUNCTUATION: _sym = text.strip() if _sym in tuple('.?!'): _tag = "." else: _tag = _sym if self.lemmata: return "{0}/{1}/O/O/{0}".format(_sym, _tag) else: return "{0}/{1}/O/O".format(_sym, _tag) if self.tokenize: _tokenized = " ".join(self.tokenizer.tokenize(text)) else: _tokenized = text print(_tokenized) _parsed = pattern_parsetree(_tokenized, # text is tokenized before it is passed on # to pattern.de.parsetree tokenize=False, tags=self.tags, chunks=self.chunks, relations=self.relations, lemmata=self.lemmata, encoding=self.encoding, tagset=self.tagset) if self.pprint: _parsed = pattern_pprint(_parsed) return _parsed