# -*- coding: utf-8 -*-
#
# Code adapted from the main `TextBlob`_ library.
#
# :repo: `https://github.com/sloria/TextBlob`_
# :source: textblob/tokenizers.py
# :version: 2013-12-27 (fbdcaf2709)
#
# :modified: 2014-10-02 <m.killer@langui.ch>
#
"""Various tokenizer implementations."""
from __future__ import absolute_import
import string
from itertools import chain
import nltk
from textblob.utils import strip_punc
from textblob.base import BaseTokenizer
from textblob.decorators import requires_nltk_corpus
from textblob_de.packages import pattern_de
from textblob_de.packages import pattern_text
find_sentences = pattern_text.find_tokens
replacements = pattern_text.replacements
PUNCTUATION = string.punctuation
ABBREVIATIONS_DE = pattern_de.ABBREVIATIONS


class NLTKPunktTokenizer(BaseTokenizer):

    """Tokenizer included in the ``nltk.tokenize.punkt`` package.

    This is the default tokenizer in ``textblob-de``.

    **PROs:**

    * a trained model is available for German
    * deals with many abbreviations and common German tokenization
      problems out of the box

    **CONs:**

    * not very flexible (the model has to be re-trained on your own corpus)

    """

    def __init__(self):
        self.tokens = []
        self.sent_tok = nltk.tokenize.load('tokenizers/punkt/german.pickle')
        self.word_tok = nltk.tokenize.TreebankWordTokenizer()

    def tokenize(self, text, include_punc=True, nested=False):
        """Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation as
            separate tokens. Defaults to ``True``.
        :param nested: (optional) whether to return tokens as nested lists
            of sentences. Defaults to ``False``.
        """
        self.tokens = [
            self.word_tokenize(s, include_punc)
            for s in self.sent_tokenize(text)]
        if nested:
            return self.tokens
        else:
            return list(chain.from_iterable(self.tokens))
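
    # A minimal usage sketch for ``tokenize()`` (illustrative only; the
    # exact output depends on the installed German Punkt model):
    #
    #     >>> tok = NLTKPunktTokenizer()
    #     >>> tok.tokenize("Heute ist gutes Wetter. Wir gehen spazieren.",
    #     ...              nested=True)
    #     [['Heute', 'ist', 'gutes', 'Wetter', '.'],
    #      ['Wir', 'gehen', 'spazieren', '.']]
    #     >>> tok.tokenize("Heute ist gutes Wetter. Wir gehen spazieren.")
    #     ['Heute', 'ist', 'gutes', 'Wetter', '.', 'Wir', 'gehen',
    #      'spazieren', '.']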

    @requires_nltk_corpus
    def sent_tokenize(self, text, **kwargs):
        """NLTK's sentence tokenizer (currently ``PunktSentenceTokenizer``).

        Uses an unsupervised algorithm to build a model for abbreviation
        words, collocations, and words that start sentences, then uses
        that model to find sentence boundaries.
        """
        sentences = self.sent_tok.tokenize(
            text,
            realign_boundaries=kwargs.get("realign_boundaries", True))
        return sentences
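
    # Illustrative sketch (assumes the German Punkt model has been
    # downloaded, e.g. via ``nltk.download('punkt')``; the abbreviation
    # handling shown is what the trained model is expected to produce):
    #
    #     >>> tok = NLTKPunktTokenizer()
    #     >>> tok.sent_tokenize("Das ist z.B. ein Satz. Und noch einer.")
    #     ['Das ist z.B. ein Satz.', 'Und noch einer.']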

    def word_tokenize(self, text, include_punc=True):
        """The Treebank tokenizer uses regular expressions to tokenize text
        as in the Penn Treebank.

        It assumes that the text has already been segmented into sentences,
        e.g. using ``self.sent_tokenize()``.

        This tokenizer performs the following steps:

        - split standard contractions, e.g. ``don't`` -> ``do n't``
          and ``they'll`` -> ``they 'll``
        - treat most punctuation characters as separate tokens
        - split off commas and single quotes, when followed by whitespace
        - separate periods that appear at the end of a line

        Source: NLTK's docstring of ``TreebankWordTokenizer``
        (accessed: 02/10/2014)
        """
        #: Do not process empty strings (Issue #3)
        if text.strip() == "":
            return []
        _tokens = self.word_tok.tokenize(text)
        #: Handle strings consisting of a single punctuation mark
        #: separately (Issue #4)
        if len(_tokens) == 1:
            if _tokens[0] in PUNCTUATION:
                if include_punc:
                    return _tokens
                else:
                    return []
        if include_punc:
            return _tokens
        else:
            # Return each word token, stripping punctuation unless the
            # token comes from a contraction:
            # e.g. "gibt's" => ["gibt", "'s"] in "Heute gibt's viel zu tun!"
            # e.g. "hat's"  => ["hat", "'s"]
            # e.g. "home."  => ["home"]
            words = [
                word if word.startswith("'")
                else strip_punc(word, all=False)
                for word in _tokens
                if strip_punc(word, all=False)]
            return list(words)
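
    # Hedged example of the contraction and punctuation handling described
    # above (output is illustrative):
    #
    #     >>> tok = NLTKPunktTokenizer()
    #     >>> tok.word_tokenize("Heute gibt's viel zu tun!")
    #     ['Heute', 'gibt', "'s", 'viel', 'zu', 'tun', '!']
    #     >>> tok.word_tokenize("Heute gibt's viel zu tun!",
    #     ...                   include_punc=False)
    #     ['Heute', 'gibt', "'s", 'viel', 'zu', 'tun']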


class PatternTokenizer(BaseTokenizer):

    """Tokenizer included in the ``pattern.de`` package.

    **PROs:**

    * handles emoticons
    * flexible implementation of abbreviations
    * can be adapted very easily

    **CONs:**

    * ordinal numbers cause sentence breaks
    * indices of ``Sentence()`` objects cannot be computed

    """

    def __init__(self):
        self.tokens = []

    def tokenize(self, text, include_punc=True, nested=False):
        """Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation as
            separate tokens. Defaults to ``True``.
        :param nested: (optional) whether to return tokens as nested lists
            of sentences. Defaults to ``False``.
        """
        self.tokens = [
            self.word_tokenize(s, include_punc)
            for s in self.sent_tokenize(text)]
        if nested:
            return self.tokens
        else:
            return list(chain.from_iterable(self.tokens))

    def sent_tokenize(self, text, **kwargs):
        """Return a list of sentences.

        Each sentence is a space-separated string of tokens (words).
        Handles common cases of abbreviations (e.g., etc., ...).
        Punctuation marks are split off from other words. Periods (or ``?!``)
        mark the end of a sentence. Headings without a trailing period are
        inferred from line breaks.
        """
        sentences = find_sentences(
            text,
            punctuation=kwargs.get("punctuation", PUNCTUATION),
            abbreviations=kwargs.get("abbreviations", ABBREVIATIONS_DE),
            replace=kwargs.get("replace", replacements),
            linebreak=r"\n{2,}")
        return sentences
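
    # Sketch of the expected return format (illustrative; the actual
    # splitting depends on ``pattern_text.find_tokens`` and the
    # abbreviation list):
    #
    #     >>> tok = PatternTokenizer()
    #     >>> tok.sent_tokenize("Das ist ein Satz! Und hier noch einer.")
    #     ['Das ist ein Satz !', 'Und hier noch einer .']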

    def word_tokenize(self, sentences, include_punc=True):
        """Return a list of word tokens for a sentence string as produced
        by ``self.sent_tokenize()`` (tokens separated by single spaces)."""
        #: Do not process empty strings (Issue #3)
        if sentences.strip() == "":
            return []
        _tokens = sentences.split(" ")
        #: Handle strings consisting of a single punctuation mark
        #: separately (Issue #4)
        if len(_tokens) == 1:
            if _tokens[0] in PUNCTUATION:
                if include_punc:
                    return _tokens
                else:
                    return []
        if include_punc:
            last_word = _tokens[-1]
            # Make sure that a trailing '.' token is not split into
            # ['', '.'] (Issue #5)
            if last_word.endswith('.') and len(last_word) > 1:
                _tokens = _tokens[:-1] + [last_word[:-1], '.']
            return _tokens
        else:
            # Return each word token, stripping punctuation unless the
            # token comes from a contraction:
            # e.g. "gibt's" => ["gibt", "'s"] in "Heute gibt's viel zu tun!"
            # e.g. "hat's"  => ["hat", "'s"]
            # e.g. "home."  => ["home"]
            words = [
                word if word.startswith("'")
                else strip_punc(word, all=False)
                for word in _tokens
                if strip_punc(word, all=False)]
            return list(words)
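
    # Sketch of the trailing-period handling from Issue #5 (the behaviour
    # follows directly from the code above):
    #
    #     >>> tok = PatternTokenizer()
    #     >>> tok.word_tokenize("Und hier noch einer.")
    #     ['Und', 'hier', 'noch', 'einer', '.']
    #     >>> tok.word_tokenize("Und hier noch einer .")  # already split
    #     ['Und', 'hier', 'noch', 'einer', '.']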


class WordTokenizer(BaseTokenizer):

    """Generic word tokenization class, using the tokenizer specified in
    the ``TextBlobDE()`` instance.

    You can also pass the tokenizer as a keyword argument:
    ``WordTokenizer(tokenizer=NLTKPunktTokenizer())``

    Enables the ``WordTokenizer().itokenize`` generator that would
    otherwise be lost.

    Default: ``NLTKPunktTokenizer().word_tokenize(text, include_punc=True)``

    Aim: not to break the core API of the main `TextBlob`_ library.

    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to
        :class:`NLTKPunktTokenizer() <textblob_de.tokenizers.NLTKPunktTokenizer>`.

    .. _TextBlob: http://textblob.readthedocs.org/

    """

    def __init__(self, tokenizer=None, *args, **kwargs):
        # Make sure that ``tokenizer`` does not refer to this class itself,
        # which would lead to infinite recursion between ``tokenize()``
        # and ``word_tokenize()``.
        self.tokenizer = tokenizer if tokenizer is not None and \
            not isinstance(tokenizer, WordTokenizer) else NLTKPunktTokenizer()

    def tokenize(self, text, include_punc=True, **kwargs):
        """Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation as
            separate tokens. Defaults to ``True``.
        """
        return self.tokenizer.word_tokenize(text, include_punc, **kwargs)

    def word_tokenize(self, text, include_punc=True):
        """Compatibility method for the tokenizers included in
        ``textblob-de``."""
        return self.tokenize(text, include_punc)
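
    # Usage sketch (illustrative): the wrapper delegates to the configured
    # word tokenizer and, via ``BaseTokenizer``, also provides the lazy
    # ``itokenize()`` generator:
    #
    #     >>> wt = WordTokenizer()  # defaults to NLTKPunktTokenizer()
    #     >>> wt.tokenize("Guten Tag!")
    #     ['Guten', 'Tag', '!']
    #     >>> gen = wt.itokenize("Guten Tag!")  # lazy generator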


class SentenceTokenizer(BaseTokenizer):

    """Generic sentence tokenization class, using the tokenizer specified in
    the ``TextBlobDE()`` instance.

    Enables the ``SentenceTokenizer().itokenize`` generator that would
    otherwise be lost.

    Aim: not to break the core API of the main `TextBlob`_ library.

    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to
        :class:`NLTKPunktTokenizer() <textblob_de.tokenizers.NLTKPunktTokenizer>`.

    .. _TextBlob: http://textblob.readthedocs.org/

    """

    def __init__(self, tokenizer=None, *args, **kwargs):
        # Make sure that ``tokenizer`` does not refer to this class itself,
        # which would lead to infinite recursion between ``tokenize()``
        # and ``sent_tokenize()``.
        self.tokenizer = tokenizer if tokenizer is not None and \
            not isinstance(tokenizer, SentenceTokenizer) \
            else NLTKPunktTokenizer()

    def tokenize(self, text, **kwargs):
        """Return a list of sentences.

        :param text: string of text.
        """
        return self.tokenizer.sent_tokenize(text, **kwargs)

    def sent_tokenize(self, text, **kwargs):
        """Compatibility method for the tokenizers included in
        ``textblob-de``."""
        return self.tokenize(text, **kwargs)
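
    # Usage sketch (illustrative; assumes the German Punkt model is
    # installed):
    #
    #     >>> st = SentenceTokenizer()  # defaults to NLTKPunktTokenizer()
    #     >>> st.tokenize("Erster Satz. Zweiter Satz.")
    #     ['Erster Satz.', 'Zweiter Satz.']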


def sent_tokenize(text, tokenizer=None):
    """Convenience function for tokenizing text into sentences.

    If ``tokenizer`` is not specified, the default ``NLTKPunktTokenizer()``
    is used (same behaviour as in the main `TextBlob`_ library).

    This function returns the sentences as a generator object.

    .. _TextBlob: http://textblob.readthedocs.org/

    """
    _tokenizer = tokenizer if tokenizer is not None else NLTKPunktTokenizer()
    return SentenceTokenizer(tokenizer=_tokenizer).itokenize(text)


def word_tokenize(text, tokenizer=None, include_punc=True, *args, **kwargs):
    """Convenience function for tokenizing text into words.

    NOTE: NLTK's word tokenizer expects sentences as input, so the text is
    tokenized into sentences before being tokenized into words.

    This function returns an ``itertools.chain`` object (a generator).
    """
    _tokenizer = tokenizer if tokenizer is not None else NLTKPunktTokenizer()
    words = chain.from_iterable(
        WordTokenizer(tokenizer=_tokenizer).itokenize(
            sentence, include_punc, *args, **kwargs)
        for sentence in sent_tokenize(text, tokenizer=_tokenizer))
    return words
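

# A short end-to-end sketch of the convenience functions (illustrative;
# assumes the German Punkt model is installed):
#
#     >>> list(sent_tokenize("Erster Satz. Zweiter Satz."))
#     ['Erster Satz.', 'Zweiter Satz.']
#     >>> list(word_tokenize("Erster Satz. Zweiter Satz."))
#     ['Erster', 'Satz', '.', 'Zweiter', 'Satz', '.']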