Source code for textblob_de.taggers

# -*- coding: utf-8 -*-
#
# Code adapted from ``textblob-fr`` sample extension.
#
# :repo: `https://github.com/sloria/textblob-fr`_
# :source: textblob_fr/taggers.py
# :version: 2013-10-28 (5c6329d209)
#
# :modified: 2014-08-04 <m.killer@langui.ch>
#
"""Default taggers for German.

>>> from textblob_de.taggers import PatternTagger

or

>>> from textblob_de import PatternTagger

"""

from __future__ import absolute_import
import string

from textblob.base import BaseTagger
from textblob.utils import PUNCTUATION_REGEX

from textblob_de.packages import pattern_de

from textblob_de.compat import unicode
from textblob_de.tokenizers import PatternTokenizer

pattern_tag = pattern_de.tag
PUNCTUATION = string.punctuation


class PatternTagger(BaseTagger):

    """Tagger that uses the implementation in Tom de Smedt's pattern library
    (http://www.clips.ua.ac.be/pattern).

    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults
        to :class:`PatternTokenizer() <textblob_de.tokenizers.PatternTokenizer>`.
    :param include_punc: (optional) Whether to include punctuation as separate
        tokens. Defaults to ``False``.
    :param encoding: (optional) Input string encoding. Defaults to ``utf-8``.
    :param tagset: (optional) Tagset to use: ``None`` (Penn Treebank II, the
        default), ``'penn'``, ``'universal'`` or ``'stts'``.

    """

    def __init__(self, tokenizer=None, include_punc=False,
                 encoding='utf-8', tagset=None):
        self.tokenizer = tokenizer if tokenizer is not None else PatternTokenizer()
        self.include_punc = include_punc
        self.encoding = encoding
        self.tagset = tagset
    def tag(self, sentence, tokenize=True):
        """Tag a string ``sentence``.

        :param str or list sentence: A string or a list of sentence strings.
        :param tokenize: (optional) If ``False``, the string has to be
            tokenized beforehand (tokens separated by spaces).

        """
        #: Do not process empty strings (Issue #3)
        if sentence.strip() == "":
            return []
        #: Do not process strings consisting of a single punctuation mark
        #: (Issue #4)
        elif sentence.strip() in PUNCTUATION:
            if self.include_punc:
                _sym = sentence.strip()
                if _sym in tuple('.?!'):
                    _tag = "."
                else:
                    _tag = _sym
                return [(_sym, _tag)]
            else:
                return []
        if tokenize:
            _tokenized = " ".join(self.tokenizer.tokenize(sentence))
            sentence = _tokenized
        # The sentence is passed on to pattern.de.tag already tokenized
        # (i.e. it is either submitted pre-tokenized or tokenized here
        # if ``tokenize=True``).
        _tagged = pattern_tag(sentence, tokenize=False,
                              encoding=self.encoding, tagset=self.tagset)
        if self.include_punc:
            return _tagged
        else:
            _tagged = [(word, t) for word, t in _tagged
                       if not PUNCTUATION_REGEX.match(unicode(t))]
            return _tagged
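

# Minimal usage sketch: construct the tagger with its defaults and tag a
# short German sentence. The exact tag labels depend on the pattern.de
# version bundled with textblob_de and on the chosen tagset; the output
# shown in the comment is illustrative.
if __name__ == "__main__":
    tagger = PatternTagger()
    print(tagger.tag(u"Das ist ein sehr schönes Auto."))
    # -> a list of (token, tag) tuples,
    #    e.g. [('Das', 'DT'), ('ist', 'VB'), ('ein', 'DT'), ...]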