Source code for textblob_de.sentiments

# -*- coding: utf-8 -*-
#
# Code adapted from ``textblob-fr`` sample extension.
#
# :repo: `https://github.com/sloria/textblob-fr`_
# :source: textblob_fr/sentiments.py
# :version: 2013-10-28 (a88e86a76a)
#
# :modified: 2014-08-04 <m.killer@langui.ch>
#
"""German sentiment analysis implementations.

Main resource for ``de-sentiment.xml``:

* `German Polarity Lexicon <http://bics.sentimental.li/index.php/downloads>`_
* See xml comment section in ``de-sentiment.xml`` for details

.. todo::

       Enhance the German Polarity Lexicon using publicly available resources.

       Missing values:

       * Subjectivity
       * (Intensity)

       Possible sources:

       * `Pattern Project <http://www.clips.ua.ac.be/pages/pattern>`_

           * fr-sentiment.xml / en-sentiment.xml / nl-sentiment.xml

       * `IGGSA <https://sites.google.com/site/iggsahome/>`_

    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to
        :class:`PatternTokenizer() <textblob_de.tokenizers.PatternTokenizer>`.

"""
from __future__ import absolute_import
from collections import namedtuple

import os

from textblob_de.base import BaseSentimentAnalyzer, CONTINUOUS
from textblob_de.lemmatizers import PatternParserLemmatizer
from textblob_de.packages import pattern_text
from textblob_de.tokenizers import PatternTokenizer


#################### PATTERN ANALYZER ####################################

# adapted from 'textblob_fr.fr.py'
#################### PATTERN SENTIMENT DETECTION #########################
# Directory that contains this module; the sentiment lexicon path is built
# relative to it.
try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except NameError:
    # ``__file__`` is not defined in every execution context. Fall back to an
    # empty prefix so data paths become relative to the working directory.
    # Only NameError is caught so that KeyboardInterrupt/SystemExit and
    # unrelated errors are no longer silently swallowed.
    MODULE = ""

# Private alias for ``pattern_text.Sentiment``: this module defines its own
# ``Sentiment`` subclass, which uses this alias as its base class.
_Sentiment = pattern_text.Sentiment


class Sentiment(_Sentiment):
    """``pattern_text.Sentiment`` subclass that, on a default load, also
    registers accent-free spellings of its entries.
    """

    def load(self, path=None):
        """Load the entries, then add variants with diacritics stripped.

        :param path: Passed unchanged to the parent ``load``. The
            accent-free variants are only added when this value is falsy.
        """
        _Sentiment.load(self, path)
        if path:
            return

        # Map "précaire" to "precaire" (without diacritics, +1% accuracy).
        accented_endings = (u"à", u"è", u"é", u"ê", u"ï")
        substitutions = (
            (u"à", "a"),
            (u"é", "e"),
            (u"è", "e"),
            (u"ê", "e"),
            (u"ï", "i"),
        )

        # Iterate over a snapshot of the entries, since annotate() is
        # called inside the loop.
        for word, tags in list(self.items()):
            # Words that end in an accented character are left untouched.
            if word.endswith(accented_endings):
                continue
            plain = word
            for accented, bare in substitutions:
                plain = plain.replace(accented, bare)
            if plain == word:
                continue
            # Register every tag entry of the original word under the
            # accent-free spelling as well.
            for tag, (polarity, subjectivity, intensity) in tags.items():
                self.annotate(plain, tag, polarity, subjectivity, intensity)
# Module-level analyzer configured for German; the lexicon file is looked up
# in the ``data`` directory next to this module.
sentiment = Sentiment(
    path=os.path.join(MODULE, "data", "de-sentiment.xml"),
    synset=None,
    negations=(
        "nicht", "ohne", "nie", "nein",
        "kein", "keiner", "keine", "nichts",
    ),
    modifiers=("RB", "JJ"),
    modifier=lambda word: word.endswith("lich"),
    # tokenizer=_tokenizer,
    language="de",
)


def pattern_sentiment(text):
    """Apply the module-level ``sentiment`` object to *text*.

    :param text: The value handed to ``sentiment``.
    :returns: Whatever ``sentiment(text)`` returns.
    """
    return sentiment(text)

#################### END SENTIMENT DETECTION ##################################
class PatternAnalyzer(BaseSentimentAnalyzer):
    """Sentiment analyzer that uses the same implementation as the
    pattern library. Returns results as a tuple of the form:

    ``(polarity, subjectivity)``

    :param tokenizer: (optional) A tokenizer instance. If ``None``, defaults to
        :class:`PatternTokenizer() <textblob_de.tokenizers.PatternTokenizer>`.
    :param lemmatizer: (optional) A lemmatizer instance. If ``None``, a
        ``PatternParserLemmatizer`` using the analyzer's tokenizer is created
        when lemmatization is enabled.
    :param bool lemmatize: If true (default), the text is lemmatized before
        it is passed to the sentiment function.

    """
    #: Enhancement Issue #2
    #: adapted from 'textblob.en.sentiments.py'
    kind = CONTINUOUS
    #: Return type declaration
    RETURN_TYPE = namedtuple('Sentiment', ['polarity', 'subjectivity'])

    def __init__(self, tokenizer=None, lemmatizer=None, lemmatize=True):
        self.tokenizer = tokenizer if tokenizer is not None else PatternTokenizer()
        self.lemmatize = lemmatize
        # Always define ``lemmatizer`` so the attribute exists even when
        # lemmatization is disabled; the default is only built when needed.
        self.lemmatizer = lemmatizer
        if self.lemmatize and self.lemmatizer is None:
            self.lemmatizer = PatternParserLemmatizer(tokenizer=self.tokenizer)

    def analyze(self, text):
        """Return the sentiment as a tuple of the form:
        ``(polarity, subjectivity)``

        :param str text: A string.

        .. todo::

            Figure out best format to be passed to the analyzer.

            There might be a better format than a string of space separated
            lemmas (e.g. with pos tags) but the parsing/tagging
            results look rather inaccurate and a wrong pos
            might prevent the lexicon lookup of an otherwise correctly
            lemmatized word form (or would it not?) - further checks needed.

        """
        if self.lemmatize:
            text = self._lemmatize(text)
        return self.RETURN_TYPE(*pattern_sentiment(text))

    def _lemmatize(self, raw):
        """Return *raw* as a single string of space separated lemmas."""
        if getattr(self, "lemmatizer", None) is None:
            # ``lemmatize`` was enabled after construction (or no lemmatizer
            # was ever set): build the default one on first use.
            self.lemmatizer = PatternParserLemmatizer(tokenizer=self.tokenizer)
        # The lemmatizer returns a list of
        # [(lemma1, tag1), (lemma2, tag2), ...]
        tagged_lemmas = self.lemmatizer.lemmatize(raw)
        # Pass to the analyzer as a string; the tags are dropped.
        return " ".join(lemma for lemma, _tag in tagged_lemmas)
#################### END PATTERN ANALYZER ################################