Source code for textblob_de.blob

# -*- coding: utf-8 -*-
# Code adapted from the main `TextBlob`_ library.
#
# :repo: `https://github.com/sloria/TextBlob`_
# :source: textblob/blob.py
# :version: 2013-10-21 (a88e86a76a)
#
# :modified: 2014-09-17 <m.killer@langui.ch>
#
"""Wrappers for various units of text.

This includes the main :class:`TextBlobDE <textblob_de.blob.TextBlobDE>`,
:class:`Word <textblob_de.blob.Word>`, and :class:`WordList <textblob_de.blob.WordList>` classes.

Whenever possible, classes are inherited from the main `TextBlob`_ library, but in many
cases the models for German have to be initialised here in :py:mod:`textblob_de.blob`, resulting
in a lot of duplicate code. The main reason is the :class:`Word <textblob_de.blob.Word>` objects:
if they are generated from an inherited class, they will use the English models
(e.g. for ``pluralize``/``singularize``) of the main library.

Example usage: ::

    >>> from textblob_de import TextBlobDE
    >>> b = TextBlobDE("Einfach ist besser als kompliziert.")
    >>> b.tags
    [('Einfach', 'RB'), ('ist', 'VB'), ('besser', 'RB'), ('als', 'IN'), ('kompliziert', 'JJ')]
    >>> b.noun_phrases
    WordList([])
    >>> b.words
    WordList(['Einfach', 'ist', 'besser', 'als', 'kompliziert'])

.. _TextBlob: http://textblob.readthedocs.org/

"""
from __future__ import absolute_import

import json
import sys

from collections import defaultdict, namedtuple

from textblob.blob import _initialize_models
from textblob.decorators import cached_property, requires_nltk_corpus
from textblob.translate import Translator
from textblob.utils import lowerstrip

from textblob_de.base import BaseBlob as _BaseBlob
from textblob_de.compat import unicode, basestring
from textblob_de.tokenizers import NLTKPunktTokenizer
from textblob_de.tokenizers import word_tokenize, sent_tokenize
from textblob_de.taggers import PatternTagger
from textblob_de.packages import pattern_de
from textblob_de.parsers import PatternParser
from textblob_de.np_extractors import PatternParserNPExtractor
from textblob_de.lemmatizers import PatternParserLemmatizer
from textblob_de.sentiments import PatternAnalyzer

_singularize = pattern_de.inflect.singularize
_pluralize = pattern_de.inflect.pluralize


class Word(unicode):

    """A simple word representation.

    Includes methods for inflection, translation, and WordNet integration.

    """

    translator = Translator()

    def __new__(cls, string, pos_tag=None):
        """Return a new instance of the class.

        It is necessary to override this method in order to handle the
        extra pos_tag argument in the constructor.

        """
        return super(Word, cls).__new__(cls, string)

    def __init__(self, string, pos_tag=None):
        self.string = string
        self.pos_tag = pos_tag

    def __repr__(self):
        return repr(self.string)

    def __str__(self):
        return self.string

    def singularize(self):
        """Return the singular version of the word as a string."""
        return Word(_singularize(self.string))

    def pluralize(self):
        """Return the plural version of the word as a string."""
        return Word(_pluralize(self.string))

    def translate(self, from_lang=None, to="de"):
        """Translate the word to another language using Google's
        Translate API.

        .. versionadded:: 0.5.0 (``textblob``)

        """
        if from_lang is None:
            from_lang = self.translator.detect(self.string)
        return self.translator.translate(self.string,
                                         from_lang=from_lang, to_lang=to)

    def detect_language(self):
        """Detect the word's language using Google's Translate API.

        .. versionadded:: 0.5.0 (``textblob``)

        """
        return self.translator.detect(self.string)

    def spellcheck(self):
        """Return a list of (word, confidence) tuples of spelling corrections.

        Based on: Peter Norvig, "How to Write a Spelling Corrector"
        (http://norvig.com/spell-correct.html) as implemented in the pattern
        library.

        .. versionadded:: 0.6.0 (``textblob``)

        """
        # return suggest(self.string)
        raise NotImplementedError

    def correct(self):
        """Correct the spelling of the word.

        Returns the word with the highest confidence using the spelling
        corrector.

        .. versionadded:: 0.6.0 (``textblob``)

        """
        # return Word(self.spellcheck()[0][0])
        raise NotImplementedError

    @cached_property
    def lemma(self):
        """Return the lemma of this word using Wordnet's morphy function."""
        # tag = _penn_to_wordnet(self.pos_tag) if (self.pos_tag is not None) else None
        # return self.lemmatize(pos=tag)
        raise NotImplementedError

    @requires_nltk_corpus
    def lemmatize(self, pos=None):
        """Return the lemma for a word using WordNet's morphy function.

        :param pos: Part of speech to filter upon. If `None`, defaults to
            ``_wordnet.NOUN``.

        .. versionadded:: 0.8.1 (``textblob``)

        """
        # if pos is None:
        #     pos = _wordnet.NOUN
        # lemmatizer = nltk.stem.WordNetLemmatizer()
        # return lemmatizer.lemmatize(self.string, pos)
        raise NotImplementedError

    @cached_property
    def synsets(self):
        """The list of Synset objects for this Word.

        :rtype: list of Synsets

        .. versionadded:: 0.7.0 (``textblob``)

        """
        # return self.get_synsets(pos=None)
        raise NotImplementedError

    @cached_property
    def definitions(self):
        """The list of definitions for this word.

        Each definition corresponds to a synset.

        .. versionadded:: 0.7.0 (``textblob``)

        """
        # return self.define(pos=None)
        raise NotImplementedError

    def get_synsets(self, pos=None):
        """Return a list of Synset objects for this word.

        :param pos: A part-of-speech tag to filter upon. If ``None``, all
            synsets for all parts of speech will be loaded.

        :rtype: list of Synsets

        .. versionadded:: 0.7.0 (``textblob``)

        """
        # return _wordnet.synsets(self.string, pos)
        raise NotImplementedError

    def define(self, pos=None):
        """Return a list of definitions for this word.

        Each definition corresponds to a synset for this word.

        :param pos: A part-of-speech tag to filter upon. If ``None``,
            definitions for all parts of speech will be loaded.

        :rtype: List of strings

        .. versionadded:: 0.7.0 (``textblob``)

        """
        # return [syn.definition for syn in self.get_synsets(pos=pos)]
        raise NotImplementedError
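
# Illustrative usage sketch (not part of the original module): the
# inflection methods delegate to the bundled ``pattern.de`` models, so
# the exact results depend on those models' coverage of the given noun.
#
#     >>> from textblob_de.blob import Word
#     >>> Word("Autos").singularize()
#     'Auto'
#     >>> Word("Auto").pluralize()
#     'Autos'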
# Cannot inherit from textblob.blob.WordList, otherwise the
# properties of Word() will use the English models
class WordList(list):

    """A list-like collection of words."""

    def __init__(self, collection):
        """Initialize a WordList.

        Takes a collection of strings as its only argument.

        """
        self._collection = [Word(w) for w in collection]
        super(WordList, self).__init__(self._collection)

    def __str__(self):
        return str(self._collection)

    def __repr__(self):
        """Returns a string representation for debugging."""
        class_name = self.__class__.__name__
        return '{cls}({lst})'.format(cls=class_name,
                                     lst=repr(self._collection))

    def __getitem__(self, key):
        """Returns a string at the given index."""
        if isinstance(key, slice):
            return self.__class__(self._collection[key])
        else:
            return self._collection[key]

    def __getslice__(self, i, j):
        # This is included for Python 2.* compatibility
        return self.__class__(self._collection[i:j])

    def __iter__(self):
        return iter(self._collection)
    def count(self, strg, case_sensitive=False, *args, **kwargs):
        """Get the count of a word or phrase ``strg`` within this WordList.

        :param strg: The string to count.
        :param case_sensitive: A boolean, whether or not the search is
            case-sensitive.

        """
        if not case_sensitive:
            return [word.lower() for word in self].count(strg.lower(), *args,
                                                         **kwargs)
        return self._collection.count(strg, *args, **kwargs)
    def append(self, obj):
        """Append an object to the end of the WordList.

        If the object is a string, appends a :class:`Word <Word>` object.

        """
        item = Word(obj) if isinstance(obj, basestring) else obj
        self._collection.append(item)
        # Also append to the underlying list so that len() and other
        # inherited list methods stay consistent with the collection.
        return super(WordList, self).append(item)
    def extend(self, iterable):
        """Extend WordList by appending elements from ``iterable``.

        If an element is a string, appends a :class:`Word <Word>` object.

        """
        for e in iterable:
            self.append(e)
        return self
    def upper(self):
        """Return a new WordList with each word upper-cased."""
        return self.__class__([word.upper() for word in self])

    def lower(self):
        """Return a new WordList with each word lower-cased."""
        return self.__class__([word.lower() for word in self])
    def singularize(self):
        """Return the singular version of each word in this WordList."""
        return self.__class__([word.singularize() for word in self])
    def pluralize(self):
        """Return the plural version of each word in this WordList."""
        return self.__class__([word.pluralize() for word in self])

    def lemmatize(self):
        """Return the lemma of each word in this WordList.

        Currently using NLTKPunktTokenizer() for all lemmatization tasks.
        This might cause slightly different tokenization results compared
        to the TextBlob.words property.

        """
        _lemmatizer = PatternParserLemmatizer(tokenizer=NLTKPunktTokenizer())
        # WordList object --> Sentence.string
        # add a period (improves parser accuracy)
        _raw = " ".join(self) + "."
        _lemmas = _lemmatizer.lemmatize(_raw)
        return self.__class__([Word(l, t) for l, t in _lemmas])
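
# Illustrative usage sketch (not part of the original module): a WordList
# maps string and inflection operations over its members. ``upper()`` is
# deterministic; the ``pluralize()`` output shown assumes the rule-based
# ``pattern.de`` inflector handles these nouns as expected.
#
#     >>> from textblob_de.blob import WordList
#     >>> wl = WordList(["Haus", "Auto"])
#     >>> wl.upper()
#     WordList(['HAUS', 'AUTO'])
#     >>> wl.pluralize()
#     WordList(['Häuser', 'Autos'])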
class BaseBlob(_BaseBlob):

    """``BaseBlob`` class initialised with German default models:

    An abstract base class that all textblob classes will inherit from.
    Includes words, POS tag, NP, and word count properties. Also includes
    basic dunder and string methods for making objects like Python strings.

    :param str text: A string.
    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to
        :class:`NLTKPunktTokenizer() <textblob_de.tokenizers.NLTKPunktTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``, defaults to
        :class:`PatternParserNPExtractor() <textblob_de.np_extractors.PatternParserNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``, defaults to
        :class:`PatternTagger <textblob_de.taggers.PatternTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to
        :class:`PatternAnalyzer <textblob_de.sentiments.PatternAnalyzer>`.
    :param classifier: (optional) A classifier.

    .. versionchanged:: 0.6.0
        ``clean_html`` parameter deprecated, as it was in NLTK.

    """

    def __init__(self, text, tokenizer=None, pos_tagger=None,
                 np_extractor=None, analyzer=None, parser=None,
                 classifier=None, clean_html=False):
        self.tokenizer = tokenizer if tokenizer is not None \
            else NLTKPunktTokenizer()
        self.pos_tagger = pos_tagger if pos_tagger is not None \
            else PatternTagger(tokenizer=self.tokenizer)
        self.np_extractor = np_extractor if np_extractor is not None \
            else PatternParserNPExtractor(tokenizer=self.tokenizer)
        self.analyzer = analyzer if analyzer is not None \
            else PatternAnalyzer(tokenizer=self.tokenizer)
        self.parser = parser if parser is not None \
            else PatternParser(tokenizer=self.tokenizer)
        self.classifier = classifier if classifier is not None else None
        if not isinstance(text, basestring):
            raise TypeError('The `text` argument passed to `__init__(text)` '
                            'must be a string, not {0}'.format(type(text)))
        if clean_html:
            raise NotImplementedError(
                "clean_html has been deprecated. "
                "To remove HTML markup, use BeautifulSoup's "
                "get_text() function")
        self.raw = self.string = text
        self.stripped = lowerstrip(self.raw, all=True)
        _initialize_models(self, self.tokenizer, self.pos_tagger,
                           self.np_extractor, self.analyzer, self.parser,
                           self.classifier)

    @cached_property
    def words(self):
        """Return a list of word tokens.

        This excludes punctuation characters. If you want to include
        punctuation characters, access the ``tokens`` property.

        :returns: A :class:`WordList <WordList>` of word tokens.

        """
        return WordList(word_tokenize(self.raw, tokenizer=self.tokenizer,
                                      include_punc=False))

    @cached_property
    def tokens(self):
        """Return a list of tokens, using this blob's tokenizer object
        (defaults to :class:`NLTKPunktTokenizer <textblob_de.tokenizers.NLTKPunktTokenizer>`).

        """
        return WordList(self.tokenizer.tokenize(self.raw))
    def tokenize(self, tokenizer=None):
        """Return a list of tokens, using ``tokenizer``.

        :param tokenizer: (optional) A tokenizer object. If None, defaults
            to this blob's default tokenizer.

        """
        t = tokenizer if tokenizer is not None else self.tokenizer
        return WordList(t.tokenize(self.raw))
    @cached_property
    def sentiment(self):
        """Return a tuple of the form (polarity, subjectivity) where
        polarity is a float within the range [-1.0, 1.0] and subjectivity
        is a float within the range [0.0, 1.0] where 0.0 is very objective
        and 1.0 is very subjective.

        :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity)``

        """
        return self.analyzer.analyze(self.raw)

    @cached_property
    def sentiment_assessments(self):
        """Return a tuple of the form (polarity, subjectivity, assessments)
        where polarity is a float within the range [-1.0, 1.0],
        subjectivity is a float within the range [0.0, 1.0] where 0.0 is
        very objective and 1.0 is very subjective, and assessments is a
        list of polarity and subjectivity scores for the assessed tokens.

        :rtype: namedtuple of the form
            ``Sentiment(polarity, subjectivity, assessments)``

        """
        raise NotImplementedError

    @cached_property
    def polarity(self):
        """Return the polarity score as a float within the range [-1.0, 1.0].

        :rtype: float

        """
        return self.sentiment[0]

    @cached_property
    def subjectivity(self):
        """Return the subjectivity score as a float within the range
        [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

        :rtype: float

        """
        return self.sentiment[1]

    @cached_property
    def noun_phrases(self):
        """Returns a list of noun phrases for this blob."""
        return WordList([phrase.strip()
                         for phrase in self.np_extractor.extract(self.raw)
                         if len(phrase.split()) > 1])

    @cached_property
    def pos_tags(self):
        """Returns a list of tuples of the form (word, POS tag).

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
             ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples

        """
        return [(Word(word, pos_tag=t), unicode(t))
                for word, t in self.pos_tagger.tag(self.raw)
                # new keyword PatternTagger(include_punc=False)
                # if not PUNCTUATION_REGEX.match(unicode(t))
                ]

    tags = pos_tags

    @cached_property
    def word_counts(self):
        """Dictionary of word frequencies in this text."""
        counts = defaultdict(int)
        stripped_words = [lowerstrip(word) for word in self.words]
        for word in stripped_words:
            counts[word] += 1
        return counts

    @cached_property
    def np_counts(self):
        """Dictionary of noun phrase frequencies in this text."""
        counts = defaultdict(int)
        for phrase in self.noun_phrases:
            counts[phrase] += 1
        return counts
    def translate(self, from_lang=None, to="de"):
        """Translate the blob to another language."""
        if from_lang is None:
            from_lang = self.translator.detect(self.string)
        return self.__class__(self.translator.translate(
            self.raw, from_lang=from_lang, to_lang=to))

    def correct(self):
        """Attempt to correct the spelling of a blob.

        .. versionadded:: 0.6.0 (``textblob``)

        :rtype: :class:`BaseBlob <BaseBlob>`

        """
        # regex matches: contraction or word or punctuation or whitespace
        # tokens = nltk.tokenize.regexp_tokenize(self.raw, "\w*('\w*)+|\w+|[^\w\s]|\s")
        # corrected = (Word(w).correct() for w in tokens)
        # ret = ''.join(corrected)
        # return self.__class__(ret)
        raise NotImplementedError

    def _cmpkey(self):
        """Key used by ComparableMixin to implement all rich comparison
        operators."""
        return self.raw

    def _strkey(self):
        """Key used by StringlikeMixin to implement string methods."""
        return self.raw

    def __hash__(self):
        return hash(self._cmpkey())

    def __add__(self, other):
        """Concatenate two text objects the same way Python strings are
        concatenated.

        Arguments:
        - `other`: a string or a text object

        """
        if isinstance(other, basestring):
            return self.__class__(self.raw + other)
        elif isinstance(other, BaseBlob):
            return self.__class__(self.raw + other.raw)
        else:
            raise TypeError('Operands must be either strings or {0} objects'
                            .format(self.__class__.__name__))

    def split(self, sep=None, maxsplit=sys.maxsize):
        """Behaves like the built-in str.split() except returns a WordList.

        :rtype: :class:`WordList <WordList>`

        """
        return WordList(self._strkey().split(sep, maxsplit))
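
# Illustrative usage sketch (not part of the original module): blobs
# behave like strings, so concatenation works as expected and splitting
# returns a WordList.
#
#     >>> from textblob_de import TextBlobDE
#     >>> b = TextBlobDE("Einfach ist besser") + " als kompliziert."
#     >>> b.split()
#     WordList(['Einfach', 'ist', 'besser', 'als', 'kompliziert.'])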
class Sentence(BaseBlob):

    """A sentence within a TextBlob. Inherits from :class:`BaseBlob <BaseBlob>`.

    :param sentence: A string, the raw sentence.
    :param start_index: An int, the index where this sentence begins
        in a TextBlob. If not given, defaults to 0.
    :param end_index: An int, the index where this sentence ends in
        a TextBlob. If not given, defaults to the length of the sentence - 1.

    """

    def __init__(self, sentence, start_index=0, end_index=None,
                 *args, **kwargs):
        super(Sentence, self).__init__(sentence, *args, **kwargs)
        #: The start index within a TextBlob
        self.start = self.start_index = start_index
        #: The end index within a TextBlob
        self.end = self.end_index = end_index or len(sentence) - 1

    # @cached_property
    # def sentiment(self):
    #     """Return a tuple of the form (polarity, subjectivity) where
    #     polarity is a float within the range [-1.0, 1.0] and subjectivity
    #     is a float within the range [0.0, 1.0] where 0.0 is very objective
    #     and 1.0 is very subjective.
    #
    #     :rtype: tuple of the form ``(polarity, subjectivity)``
    #
    #     """
    #     _wl = self.words
    #     _lemmas = _wl.lemmatize()
    #     _string = " ".join(_lemmas)
    #     return self.analyzer.analyze(_string)

    # @cached_property
    # def polarity(self):
    #     """Return the polarity score as a float within the range [-1.0, 1.0].
    #
    #     :rtype: float
    #
    #     """
    #     return self.sentiment[0]

    # @cached_property
    # def subjectivity(self):
    #     """Return the subjectivity score as a float within the range
    #     [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.
    #
    #     :rtype: float
    #
    #     """
    #     return self.sentiment[1]

    @property
    def dict(self):
        """The dict representation of this sentence."""
        return {
            'raw': self.raw,
            'start_index': self.start_index,
            'end_index': self.end_index,
            'stripped': self.stripped,
            'noun_phrases': self.noun_phrases,
            'polarity': self.polarity,
            'subjectivity': self.subjectivity,
        }
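
# Illustrative usage sketch (not part of the original module): a Sentence
# records its character span within the parent blob, assuming the default
# NLTKPunktTokenizer splits the text at the sentence-final periods.
#
#     >>> from textblob_de import TextBlobDE
#     >>> blob = TextBlobDE("Der erste Satz. Der zweite Satz.")
#     >>> s = blob.sentences[1]
#     >>> (s.start_index, s.end_index)
#     (16, 32)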
class TextBlobDE(BaseBlob):

    """``TextBlobDE`` class initialised with German default models:

    :param str text: A string.
    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to
        :class:`NLTKPunktTokenizer() <textblob_de.tokenizers.NLTKPunktTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``, defaults to
        :class:`PatternParserNPExtractor() <textblob_de.np_extractors.PatternParserNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``, defaults to
        :class:`PatternTagger <textblob_de.taggers.PatternTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to
        :class:`PatternAnalyzer <textblob_de.sentiments.PatternAnalyzer>`.
    :param classifier: (optional) A classifier.

    """

    @cached_property
    def sentences(self):
        """Return list of :class:`Sentence <Sentence>` objects."""
        return self._create_sentence_objects()

    @cached_property
    def words(self):
        """Return a list of word tokens.

        This excludes punctuation characters. If you want to include
        punctuation characters, access the ``tokens`` property.

        :returns: A :class:`WordList <WordList>` of word tokens.

        """
        return WordList(word_tokenize(self.raw, self.tokenizer,
                                      include_punc=False))

    @property
    def raw_sentences(self):
        """List of strings, the raw sentences in the blob."""
        return [sentence.raw for sentence in self.sentences]

    @cached_property
    def sentiment(self):
        """Return a tuple of the form (polarity, subjectivity) where
        polarity is a float within the range [-1.0, 1.0] and subjectivity
        is a float within the range [0.0, 1.0] where 0.0 is very objective
        and 1.0 is very subjective.

        :rtype: namedtuple of the form
            ``Sentiment(polarity=0.0, subjectivity=0.0)``

        """
        #: Enhancement Issue #2
        #: adapted from 'textblob.en.sentiments.py'
        #: Return type declaration
        _RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity'])
        _polarity = 0
        _subjectivity = 0
        for s in self.sentences:
            _polarity += s.polarity
            _subjectivity += s.subjectivity
        try:
            polarity = _polarity / len(self.sentences)
        except ZeroDivisionError:
            polarity = 0.0
        try:
            subjectivity = _subjectivity / len(self.sentences)
        except ZeroDivisionError:
            subjectivity = 0.0
        return _RETURN_TYPE(polarity, subjectivity)

    @cached_property
    def sentiment_assessments(self):
        """Return a tuple of the form (polarity, subjectivity, assessments)
        where polarity is a float within the range [-1.0, 1.0],
        subjectivity is a float within the range [0.0, 1.0] where 0.0 is
        very objective and 1.0 is very subjective, and assessments is a
        list of polarity and subjectivity scores for the assessed tokens.

        :rtype: namedtuple of the form
            ``Sentiment(polarity, subjectivity, assessments)``

        """
        raise NotImplementedError

    @cached_property
    def polarity(self):
        """Return the polarity score as a float within the range [-1.0, 1.0].

        :rtype: float

        """
        return self.sentiment[0]

    @cached_property
    def subjectivity(self):
        """Return the subjectivity score as a float within the range
        [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

        :rtype: float

        """
        return self.sentiment[1]

    @property
    def serialized(self):
        """Returns a list of each sentence's dict representation."""
        return [sentence.dict for sentence in self.sentences]
    def to_json(self, *args, **kwargs):
        """Return a json representation (str) of this blob.

        Takes the same arguments as json.dumps.

        .. versionadded:: 0.5.1 (``textblob``)

        """
        return json.dumps(self.serialized, *args, **kwargs)

    @property
    def json(self):
        """The json representation of this blob.

        .. versionchanged:: 0.5.1
            Made ``json`` a property instead of a method to restore backwards
            compatibility that was broken after version 0.4.0.

        """
        return self.to_json()

    def _create_sentence_objects(self):
        """Returns a list of Sentence objects from the raw text."""
        sentence_objects = []
        sentences = sent_tokenize(self.raw, tokenizer=self.tokenizer)
        char_index = 0  # Keeps track of character index within the blob
        for sent in sentences:
            # Compute the start and end indices of the sentence
            # within the blob. This only works if the sentence splitter
            # does not perform any character replacements or changes to
            # white space.
            # Working: NLTKPunktTokenizer
            # Not working: PatternTokenizer
            try:
                start_index = self.raw.index(sent, char_index)
                char_index += len(sent)
                end_index = start_index + len(sent)
            except ValueError:
                start_index = None
                end_index = None
            # Sentences share the same models as their parent blob
            s = Sentence(sent, start_index=start_index, end_index=end_index,
                         tokenizer=self.tokenizer,
                         np_extractor=self.np_extractor,
                         pos_tagger=self.pos_tagger,
                         analyzer=self.analyzer,
                         parser=self.parser,
                         classifier=self.classifier)
            sentence_objects.append(s)
        return sentence_objects
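
# Illustrative usage sketch (not part of the original module): document-level
# sentiment on a TextBlobDE averages the per-sentence scores from the default
# PatternAnalyzer; the concrete numbers depend on the German sentiment
# lexicon bundled with pattern.de, so they are not shown here.
#
#     >>> from textblob_de import TextBlobDE
#     >>> blob = TextBlobDE("Das Essen war gut. Der Service war schlecht.")
#     >>> len(blob.sentences)
#     2
#     >>> polarity, subjectivity = blob.sentiment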
class BlobberDE(object):

    """A factory for TextBlobs that all share the same tagger, tokenizer,
    parser, classifier, and np_extractor.

    Usage:

        >>> from textblob_de import BlobberDE
        >>> from textblob_de.taggers import PatternTagger
        >>> from textblob_de.tokenizers import PatternTokenizer
        >>> tb = BlobberDE(pos_tagger=PatternTagger(), tokenizer=PatternTokenizer())
        >>> blob1 = tb("Das ist ein Blob.")
        >>> blob2 = tb("Dieser Blob benutzt die selben Tagger und Tokenizer.")
        >>> blob1.pos_tagger is blob2.pos_tagger
        True

    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to
        :class:`NLTKPunktTokenizer() <textblob_de.tokenizers.NLTKPunktTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``, defaults to
        :class:`PatternParserNPExtractor() <textblob_de.np_extractors.PatternParserNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``, defaults to
        :class:`PatternTagger <textblob_de.taggers.PatternTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``, defaults to
        :class:`PatternAnalyzer <textblob_de.sentiments.PatternAnalyzer>`.
    :param classifier: (optional) A classifier.

    .. versionadded:: 0.4.0 (``textblob``)

    """
    def __init__(self, tokenizer=None, pos_tagger=None, np_extractor=None,
                 analyzer=None, parser=None, classifier=None):
        self.tokenizer = tokenizer if tokenizer is not None \
            else NLTKPunktTokenizer()
        self.pos_tagger = pos_tagger if pos_tagger is not None \
            else PatternTagger(tokenizer=self.tokenizer)
        self.np_extractor = np_extractor if np_extractor is not None \
            else PatternParserNPExtractor(tokenizer=self.tokenizer)
        self.analyzer = analyzer if analyzer is not None \
            else PatternAnalyzer(tokenizer=self.tokenizer)
        self.parser = parser if parser is not None \
            else PatternParser(tokenizer=self.tokenizer)
        self.classifier = classifier if classifier is not None else None
        _initialize_models(self, self.tokenizer, self.pos_tagger,
                           self.np_extractor, self.analyzer, self.parser,
                           self.classifier)
    def __call__(self, text):
        """Return a new TextBlob object with this Blobber's ``np_extractor``,
        ``pos_tagger``, ``tokenizer``, ``analyzer``, and ``classifier``.

        :returns: A new :class:`TextBlobDE <TextBlobDE>`.

        """
        return TextBlobDE(text, tokenizer=self.tokenizer,
                          pos_tagger=self.pos_tagger,
                          np_extractor=self.np_extractor,
                          analyzer=self.analyzer,
                          parser=self.parser,
                          classifier=self.classifier)
    def __repr__(self):
        classifier_name = self.classifier.__class__.__name__ + \
            "()" if self.classifier else "None"
        return ("Blobber(tokenizer={0}(), pos_tagger={1}(), "
                "np_extractor={2}(), analyzer={3}(), "
                "parser={4}(), classifier={5})")\
            .format(self.tokenizer.__class__.__name__,
                    self.pos_tagger.__class__.__name__,
                    self.np_extractor.__class__.__name__,
                    self.analyzer.__class__.__name__,
                    self.parser.__class__.__name__,
                    classifier_name)
    __str__ = __repr__
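
# Illustrative usage sketch (not part of the original module): blobs created
# by the same BlobberDE factory share one set of model instances, so the
# comparatively expensive tokenizer, tagger, parser and analyzer objects
# are constructed only once rather than per blob.
#
#     >>> from textblob_de import BlobberDE
#     >>> tb = BlobberDE()
#     >>> tb("Erster Blob.").tokenizer is tb("Zweiter Blob.").tokenizer
#     True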