Module sentspace.Sentence

Source code
'''
Defines the Sentence class (an individual sentence and its derived forms) and
the SentenceBatch container.
'''

from collections import defaultdict
import typing
import sentspace.utils.text as text
import sentspace.utils.io as io
import re


class SentenceBatch(list):
    # a plain list subclass used to group multiple Sentence objects together
    pass

class Sentence:
    '''
    A class to keep track of an individual sentence, its tokenized form,
    lemmas, POS tags, and so on.
    Contains handy methods to perform the above segmentation/processing
    operations, as well as methods for string representation, equality, and
    indexing (on the tokenized form).
    '''
    _uid: str = None
    _raw: str = None
    _tokens: tuple = None
    _pos: tuple = None
    _lemmas: tuple = None
    _cleaned: tuple = None
    _content: tuple = None
    _lower: tuple = None
    
    def __init__(self, raw: str, uid: str = None, warn: bool = True) -> None:
        """Sentence constructor

        Args:
            raw (str): sentence in contiguous string form (will be segmented using a tokenizer)
            uid (str, optional): Unique ID of this sentence in a corpus. Defaults to None.
            warn (bool, optional): whether to warn when a UID is not supplied.
                                   Can be set to False to suppress the warning in case of
                                   intentional UID-less usage. Defaults to True.
        """        
        self._raw = re.sub(r' +', r' ', raw.strip())
        if uid is None and warn:
            io.log(f'no UID supplied for sentence {raw}\r', type='WARN')
        self._uid = uid
        self.OOV = defaultdict(set)

    def __hash__(self) -> int:
        return hash(self._raw)

    def __bool__(self) -> bool:
        '''Boolean value the sentence evaluates to.'''
        return self._raw != ''

    def __eq__(self, other) -> bool:
        """Equality operation with other sentences. Simply compares raw string.

        Args:
            other (object): object to compare against; compared by raw string form

        Returns:
            bool: True if self is equal to other, else False
        """        
        return str(self) == str(other)

    def __repr__(self) -> str:
        """Implement representation repr() of Sentence.
            prints out the UID followed by sentence

        Returns:
            str: representation of self
        """        
        return f'<{self._uid}>\t {self._raw:<32}'

    def __str__(self) -> str:
        """Returns raw string representation

        Returns:
            str: raw string
        """        
        return f'{self._raw}'

    def __len__(self) -> int:
        """Compute length of the sentence in terms of # of tokens

        Returns:
            int: # of tokens in this sentence (according to the default tokenization method `text.tokenize`)
        """        
        return len(self.tokens)

    def __getitem__(self, key: typing.Union[int, slice]) -> str:
        """Support indexing into the tokens of the sentence

        Args:
            key (int or slice): position(s) in the tokenized sentence to retrieve

        Returns:
            str: token at the indexed position (or tuple of tokens, for a slice)
        """
        return self.tokens[key]

    def __iter__(self):
        '''we are iterable, so we return an iterator over tokens'''
        return iter(self.tokens)

    @property
    def uid(self) -> str:
        return self._uid

    @property
    def tokens(self) -> typing.Tuple[str]:
        return self.tokenized()
    # ^ the `tokens` property is an alias for:
    def tokenized(self, tokenize_method=text.tokenize) -> typing.Tuple[str]:
        """Tokenize and store tokenized form as a tuple. The tokenize_method is executed only
            the first time; each subsequent call to tokenized() returns the value of a stored variable

        Args:
            tokenize_method (callable, optional): a custom tokenization function. Defaults to utils.text.tokenize.

        Returns:
            tuple: tokenized form
        """        
        if self._tokens is None:
            self._tokens = tuple(tokenize_method(self._raw.lower()))
        return self._tokens

    # @property
    # def lowercased_tokens(self) -> typing.Tuple[str]:
    #     """Return lowercased tokens of this sentence

    #     Returns:
    #         typing.Tuple[str]: tuple of lowercased tokens from this sentence
    #     """        
    #     if self._lower is None:
    #         self._lower = tuple(map(lambda x: x.lower(), self.tokenized()))
    #     return self._lower

    @property
    def pos_tags(self) -> typing.Tuple[str]:
        return self.pos_tagged()
    # ^ the `pos_tags` property is an alias for:
    def pos_tagged(self) -> typing.Tuple[str]:
        """POS-tag the sentence and return the result. Note, this method is also executed
            only once ever; each subsequent time a stored value is returned.

        Returns:
            tuple: POS tags
        """        
        if self._pos is None:
            self._pos = text.get_pos_tags(self.tokens)
        return self._pos

    @property
    def lemmas(self) -> typing.Tuple[str]:
        return self.lemmatized()
    # ^ the `lemmas` property is an alias for:
    def lemmatized(self) -> typing.Tuple[str]:
        """Lemmatize and return the lemmas

        Returns:
            tuple: Lemmas
        """        
        if self._lemmas is None:
            self._lemmas = text.get_lemmatized_tokens(self.tokens, self.pos_tags)
        return self._lemmas

    @property
    def clean_tokens(self) -> typing.Tuple[str]:
        if self._cleaned is None:
            nonletters = text.get_nonletters(self.tokens, exceptions=[])
            self._cleaned = text.strip_words(self.tokens, method='punctuation', nonletters=nonletters)
        return self._cleaned

    @property
    def content_words(self) -> typing.List[int]:
        """Check whether each word is a 'content word'. Returns a list containing 0 or 1 as boolean
            mask indicating whether the word at that position is a content word or not

        Returns:
            List[int]: boolean mask indicating content words
        """        
        if self._content is None:
            self._content = text.get_is_content(self.pos_tags, content_pos=text.pos_for_content)
        return self._content

Classes

class Sentence (raw: str, uid: str = None, warn: bool = True)

A class to keep track of an individual sentence, its tokenized form, lemmas, POS tags, and so on. Contains handy methods to perform the above segmentation/processing operations, as well as methods for string representation, equality, and indexing (on the tokenized form).

Sentence constructor

Args

raw : str
sentence in contiguous string form (will be segmented using a tokenizer)
uid : str, optional
Unique ID of this sentence in a corpus. Defaults to None.
warn : bool, optional
whether to warn when a UID is not supplied. Can be set to False to suppress the warning in case of intentional UID-less usage. Defaults to True.
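
A minimal usage sketch (the sentence and UID here are hypothetical; the exact tokens, tags, and lemmas depend on the underlying NLTK models):

from sentspace.Sentence import Sentence

s = Sentence('The cats were sleeping on the mat.', uid='ex-001')

len(s)        # number of tokens
s.tokens      # lowercased tokens
s.pos_tags    # POS tags, computed lazily and cached
s.lemmas      # lemmatized tokens
s[0]          # first token of the tokenized form
str(s)        # the whitespace-normalized raw string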

Instance variables

var clean_tokens : Tuple[str]

Tokens with punctuation and other non-letter characters stripped out; computed once and cached.
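
For example (hypothetical input; the exact token split depends on the tokenizer):

s = Sentence("Don't panic!", warn=False)
s.tokens        # tokenized form, including punctuation tokens
s.clean_tokens  # the same tokens with punctuation stripped
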
var content_words : List[int]

Check whether each word is a 'content word'. Returns a list of 0s and 1s serving as a boolean mask that indicates whether the word at each position is a content word.

Returns

List[int]
boolean mask indicating content words
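
A sketch of using the mask to pull out just the content words (hypothetical sentence; which tags count as 'content' is determined by text.pos_for_content):

s = Sentence('the quick brown fox jumps', warn=False)
mask = s.content_words
content = [tok for tok, is_content in zip(s.tokens, mask) if is_content]
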
var lemmas : Tuple[str]

Alias for lemmatized().

var pos_tags : Tuple[str]

Alias for pos_tagged().

var tokens : Tuple[str]

Alias for tokenized().

var uid : str

The sentence's unique ID within a corpus (None if not supplied).

Methods

def lemmatized(self) ‑> Tuple[str]

Lemmatize and return the lemmas

Returns

tuple
Lemmas
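
For example (hypothetical input; the exact lemmas depend on the lemmatizer and the POS tags it receives):

s = Sentence('The mice were running', warn=False)
s.lemmatized()  # e.g. ('the', 'mouse', 'be', 'run')
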
def pos_tagged(self) ‑> Tuple[str]

POS-tag the sentence and return the result. Note that this method is also executed only once; each subsequent call returns the cached value.

Returns

tuple
POS tags
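
Tags align one-to-one with tokens, so the two can be zipped together (sketch):

s = Sentence('dogs bark loudly', warn=False)
for tok, tag in zip(s.tokens, s.pos_tags):
    print(tok, tag)
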
def tokenized(self, tokenize_method=<bound method TreebankWordTokenizer.tokenize of <nltk.tokenize.treebank.TreebankWordTokenizer object>>) ‑> Tuple[str]

Tokenize and store the tokenized form as a tuple. tokenize_method is executed only on the first call; each subsequent call to tokenized() returns the cached value, so a custom tokenize_method only takes effect the first time.

Args

tokenize_method : callable, optional
a custom tokenization function. Defaults to utils.text.tokenize.

Returns

tuple
tokenized form
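
A sketch of supplying a custom tokenizer; per the caching note above, a custom tokenize_method only takes effect on the first call:

s = Sentence('a  b   c', warn=False)
s.tokenized(tokenize_method=str.split)  # ('a', 'b', 'c')
s.tokenized()                           # cached result, still ('a', 'b', 'c')
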
class SentenceBatch (*args, **kwargs)

Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list. The argument must be an iterable if specified.


Ancestors

  • builtins.list
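
A sketch of grouping sentences into a batch (the sentences and UID scheme are hypothetical; SentenceBatch adds no behavior beyond list):

from sentspace.Sentence import Sentence, SentenceBatch

lines = ['the dog barked .', 'the cat meowed .']  # stand-in for a real corpus
batch = SentenceBatch(Sentence(line, uid=f'sent-{i}') for i, line in enumerate(lines))
for s in batch:
    print(s.uid, s.tokens)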