Source code for textblob_de.tokenizers

# -*- coding: utf-8 -*-
#
# Code adapted from the main `TextBlob`_ library.
#
# :repo: `https://github.com/sloria/TextBlob`_
# :source: textblob/tokenizers.py
# :version: 2013-12-27 (fbdcaf2709)
#
# :modified: 2014-10-02 <m.killer@langui.ch>
#
"""Various tokenizer implementations."""
from __future__ import absolute_import

import string

from itertools import chain

import nltk
from textblob.utils import strip_punc
from textblob.base import BaseTokenizer
from textblob.decorators import requires_nltk_corpus

from textblob_de.packages import pattern_de
from textblob_de.packages import pattern_text

find_sentences = pattern_text.find_tokens
replacements = pattern_text.replacements
PUNCTUATION = string.punctuation
ABBREVIATIONS_DE = pattern_de.ABBREVIATIONS


class NLTKPunktTokenizer(BaseTokenizer):

    """Tokenizer included in the ``nltk.tokenize.punkt`` package.

    This is the default tokenizer in ``textblob-de``.

    **PROs:**

    * trained model available for German
    * deals with many abbreviations and common German tokenization
      problems out of the box

    **CONs:**

    * not very flexible (model has to be re-trained on your own corpus)

    """

    def __init__(self):
        self.tokens = []
        self.sent_tok = nltk.tokenize.load('tokenizers/punkt/german.pickle')
        self.word_tok = nltk.tokenize.TreebankWordTokenizer()
    def tokenize(self, text, include_punc=True, nested=False):
        """Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation
            as separate tokens. Defaults to True.
        :param nested: (optional) whether to return tokens as nested
            lists of sentences. Defaults to False.

        """
        self.tokens = [
            w for w in (self.word_tokenize(s, include_punc)
                        for s in self.sent_tokenize(text))]
        if nested:
            return self.tokens
        else:
            return list(chain.from_iterable(self.tokens))
    @requires_nltk_corpus
    def sent_tokenize(self, text, **kwargs):
        """NLTK's sentence tokenizer (currently ``PunktSentenceTokenizer``).

        Uses an unsupervised algorithm to build a model for abbreviation
        words, collocations, and words that start sentences, then uses
        that to find sentence boundaries.

        """
        sentences = self.sent_tok.tokenize(
            text,
            realign_boundaries=kwargs.get("realign_boundaries", True))
        return sentences
    def word_tokenize(self, text, include_punc=True):
        """The Treebank tokenizer uses regular expressions to tokenize text
        as in Penn Treebank.

        It assumes that the text has already been segmented into sentences,
        e.g. using ``self.sent_tokenize()``.

        This tokenizer performs the following steps:

        - split standard contractions, e.g. ``don't`` -> ``do n't``
          and ``they'll`` -> ``they 'll``
        - treat most punctuation characters as separate tokens
        - split off commas and single quotes, when followed by whitespace
        - separate periods that appear at the end of line

        Source: NLTK's docstring of ``TreebankWordTokenizer``
        (accessed: 02/10/2014)

        """
        #: Do not process empty strings (Issue #3)
        if text.strip() == "":
            return []
        _tokens = self.word_tok.tokenize(text)
        #: Handle strings consisting of a single punctuation mark
        #: separately (Issue #4)
        if len(_tokens) == 1:
            if _tokens[0] in PUNCTUATION:
                if include_punc:
                    return _tokens
                else:
                    return []
        if include_punc:
            return _tokens
        else:
            # Return each word token.
            # Strips punctuation unless the word comes from a contraction,
            # e.g. "gibt's" => ["gibt", "'s"] in "Heute gibt's viel zu tun!"
            # e.g. "hat's" => ["hat", "'s"]
            # e.g. "home." => ['home']
            words = [
                word if word.startswith("'") else strip_punc(word, all=False)
                for word in _tokens if strip_punc(word, all=False)]
            return list(words)
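
# Illustrative usage of ``NLTKPunktTokenizer`` (example added for this
# listing, not part of the original module). It assumes the German Punkt
# model 'tokenizers/punkt/german.pickle' is available in your NLTK data;
# outputs are shown for illustration:
#
#     >>> tok = NLTKPunktTokenizer()
#     >>> tok.sent_tokenize("Das ist ein Satz. Hier kommt noch einer.")
#     ['Das ist ein Satz.', 'Hier kommt noch einer.']
#     >>> tok.word_tokenize("Heute gibt's viel zu tun!", include_punc=False)
#     ['Heute', 'gibt', "'s", 'viel', 'zu', 'tun']
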
class PatternTokenizer(BaseTokenizer):

    """Tokenizer included in the ``pattern.de`` package.

    **PROs:**

    * handling of emoticons
    * flexible implementation of abbreviations
    * can be adapted very easily

    **CONs:**

    * ordinal numbers cause sentence breaks
    * indices of Sentence() objects cannot be computed

    """

    def __init__(self):
        self.tokens = []
    def tokenize(self, text, include_punc=True, nested=False):
        """Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation
            as separate tokens. Defaults to True.
        :param nested: (optional) whether to return tokens as nested
            lists of sentences. Defaults to False.

        """
        self.tokens = [
            w for w in (self.word_tokenize(s, include_punc)
                        for s in self.sent_tokenize(text))]
        if nested:
            return self.tokens
        else:
            return list(chain.from_iterable(self.tokens))
    def sent_tokenize(self, text, **kwargs):
        """Returns a list of sentences.

        Each sentence is a space-separated string of tokens (words).
        Handles common cases of abbreviations (e.g., etc., ...).
        Punctuation marks are split from other words.
        Periods (or ?!) mark the end of a sentence.
        Headings without an ending period are inferred by line breaks.

        """
        sentences = find_sentences(
            text,
            punctuation=kwargs.get("punctuation", PUNCTUATION),
            abbreviations=kwargs.get("abbreviations", ABBREVIATIONS_DE),
            replace=kwargs.get("replace", replacements),
            linebreak=r"\n{2,}")
        return sentences
    def word_tokenize(self, sentences, include_punc=True):
        """Return a list of word tokens for an already sentence-tokenized,
        space-separated string."""
        #: Do not process empty strings (Issue #3)
        if sentences.strip() == "":
            return []
        _tokens = sentences.split(" ")
        #: Handle strings consisting of a single punctuation mark
        #: separately (Issue #4)
        if len(_tokens) == 1:
            if _tokens[0] in PUNCTUATION:
                if include_punc:
                    return _tokens
                else:
                    return []
        if include_punc:
            last_word = _tokens[-1]
            # Make sure that '.' tokens are not separated into ['', '.']
            # (Issue #5)
            if last_word.endswith('.') and len(last_word) > 1:
                _tokens = _tokens[:-1] + [last_word[:-1], '.']
            return _tokens
        else:
            # Return each word token.
            # Strips punctuation unless the word comes from a contraction,
            # e.g. "gibt's" => ["gibt", "'s"] in "Heute gibt's viel zu tun!"
            # e.g. "hat's" => ["hat", "'s"]
            # e.g. "home." => ['home']
            words = [
                word if word.startswith("'") else strip_punc(word, all=False)
                for word in _tokens if strip_punc(word, all=False)]
            return list(words)
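
# Illustrative usage of ``PatternTokenizer`` (example added for this listing,
# not part of the original module). Note that ``sent_tokenize()`` returns each
# sentence as a space-separated token string with punctuation split off;
# output is shown for illustration:
#
#     >>> tok = PatternTokenizer()
#     >>> tok.sent_tokenize("Das ist ein Satz. Hier kommt noch einer.")
#     ['Das ist ein Satz .', 'Hier kommt noch einer .']
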
class WordTokenizer(BaseTokenizer):

    """Generic word tokenization class, using the tokenizer specified in
    the TextBlobDE() instance.

    You can also pass the tokenizer as a keyword argument:
    ``WordTokenizer(tokenizer=NLTKPunktTokenizer())``

    Enables the ``WordTokenizer().itokenize`` generator that would
    otherwise be lost.

    Default: ``NLTKPunktTokenizer().word_tokenize(text, include_punc=True)``

    Aim: do not break the core API of the main `TextBlob`_ library.

    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to
        :class:`NLTKPunktTokenizer() <textblob_de.tokenizers.NLTKPunktTokenizer>`.

    .. _TextBlob: http://textblob.readthedocs.org/

    """

    def __init__(self, tokenizer=None, *args, **kwargs):
        # Make sure that ``tokenizer`` does not refer to this class itself.
        self.tokenizer = tokenizer if tokenizer is not None and \
            not isinstance(tokenizer, WordTokenizer) else NLTKPunktTokenizer()
    def tokenize(self, text, include_punc=True, **kwargs):
        """Return a list of word tokens.

        :param text: string of text.
        :param include_punc: (optional) whether to include punctuation
            as separate tokens. Defaults to True.

        """
        return self.tokenizer.word_tokenize(text, include_punc, **kwargs)
    def word_tokenize(self, text, include_punc=True):
        """Compatibility method for the tokenizers included in ``textblob-de``."""
        return self.tokenize(text, include_punc)
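
# Illustrative usage of ``WordTokenizer`` (example added for this listing,
# not part of the original module): the wrapper delegates to whichever
# backend tokenizer it was given, so switching backends is a one-argument
# change, and ``itokenize`` yields tokens lazily:
#
#     >>> WordTokenizer().tokenize("Heute gibt es Software.")
#     >>> WordTokenizer(tokenizer=PatternTokenizer()).tokenize("Heute gibt es Software.")
#     >>> for token in WordTokenizer().itokenize("Heute gibt es Software."):
#     ...     print(token)
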
class SentenceTokenizer(BaseTokenizer):

    """Generic sentence tokenization class, using the tokenizer specified in
    the TextBlobDE() instance.

    Enables the ``SentenceTokenizer().itokenize`` generator that would
    otherwise be lost.

    Aim: do not break the core API of the main `TextBlob`_ library.

    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to
        :class:`NLTKPunktTokenizer() <textblob_de.tokenizers.NLTKPunktTokenizer>`.

    .. _TextBlob: http://textblob.readthedocs.org/

    """

    def __init__(self, tokenizer=None, *args, **kwargs):
        # Make sure that ``tokenizer`` does not refer to this class itself.
        self.tokenizer = tokenizer if tokenizer is not None and not isinstance(
            tokenizer, SentenceTokenizer) else NLTKPunktTokenizer()
    def tokenize(self, text, **kwargs):
        """Return a list of sentences.

        :param text: string of text.

        """
        return self.tokenizer.sent_tokenize(text, **kwargs)
    def sent_tokenize(self, text, **kwargs):
        """Compatibility method for the tokenizers included in ``textblob-de``."""
        return self.tokenize(text, **kwargs)
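
# Illustrative usage of ``SentenceTokenizer`` (example added for this listing,
# not part of the original module); ``itokenize`` is inherited from
# ``BaseTokenizer`` and yields one sentence at a time. Output shown for
# illustration, assuming the default NLTK Punkt backend:
#
#     >>> sents = SentenceTokenizer().itokenize("Das ist ein Satz. Hier kommt noch einer.")
#     >>> next(sents)
#     'Das ist ein Satz.'
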
def sent_tokenize(text, tokenizer=None):
    """Convenience function for tokenizing text into sentences.

    If ``tokenizer`` is not specified, the default ``NLTKPunktTokenizer()``
    is used (the same behaviour as in the main `TextBlob`_ library).

    This function returns the sentences as a generator object.

    .. _TextBlob: http://textblob.readthedocs.org/

    """
    _tokenizer = tokenizer if tokenizer is not None else NLTKPunktTokenizer()
    return SentenceTokenizer(tokenizer=_tokenizer).itokenize(text)
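
# Illustrative usage of the module-level ``sent_tokenize()`` (example added
# for this listing, not part of the original module); the generator has to be
# materialized, e.g. with ``list()``. Output shown for illustration:
#
#     >>> list(sent_tokenize("Das ist ein Satz. Hier kommt noch einer."))
#     ['Das ist ein Satz.', 'Hier kommt noch einer.']
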
def word_tokenize(text, tokenizer=None, include_punc=True, *args, **kwargs):
    """Convenience function for tokenizing text into words.

    NOTE: NLTK's word tokenizer expects sentences as input, so the text
    is tokenized into sentences before being tokenized into words.

    This function returns an ``itertools.chain`` object (a generator).

    """
    _tokenizer = tokenizer if tokenizer is not None else NLTKPunktTokenizer()
    words = chain.from_iterable(
        WordTokenizer(tokenizer=_tokenizer).itokenize(
            sentence, include_punc, *args, **kwargs)
        for sentence in sent_tokenize(text, tokenizer=_tokenizer))
    return words
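
# Illustrative usage of the module-level ``word_tokenize()`` (example added
# for this listing, not part of the original module); the result is a lazy
# ``itertools.chain``, so wrap it in ``list()`` to get the tokens. Output
# shown for illustration, assuming the default NLTK Punkt backend:
#
#     >>> list(word_tokenize("Heute gibt's viel zu tun!", include_punc=False))
#     ['Heute', 'gibt', "'s", 'viel', 'zu', 'tun']
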