Module sentspace.lexical.utils

Expand source code
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import percentileofscore

import sentspace.utils
from sentspace.utils import io, text
from sentspace.utils.caching import cache_to_mem  # , cache_to_disk
from sentspace.utils.misc import merge_lists


# --------- Lexical features

# list of feature names accepted by load_databases(...)
# @cache_to_mem
def get_feature_list():
    return ['NRC_Arousal', # (Mohammad 2018)
            'NRC_Valence', # (Mohammad 2018)
            'NRC_Dominance', # (Mohammad 2018)
            'OSC', # Orthographic-semantics consistency (Marelli & Amenta, 2018)
            'aoa', # Age of Acquisition (Kuperman et al., 2012)
            'concreteness', # (Brysbaert et al., 2014)
            'lexical_decision_RT', # (Balota et al., 2007)
            'log_contextual_diversity', 
            'log_lexical_frequency', 
            'n_orthographic_neighbors', 
            'num_morpheme',
            'prevalence', 
            'surprisal-1', 
            'surprisal-2', 
            'surprisal-3', 
            'surprisal-4',
            'total_degree_centrality',
            'imageability', # Glasgow norms (Scott et al., 2019) https://link.springer.com/article/10.3758/s13428-018-1099-3#Sec1
            'body-object-interaction', # (Pexman et al., 2019) https://link.springer.com/article/10.3758/s13428-018-1171-z#Sec9
            'zipf', # SUBTLEXus
            'socialness', # (Diveica et al., 2022) https://link.springer.com/article/10.3758/s13428-022-01810-x#Sec15
            # Lancaster sensorimotor norms (Lynott et al., 2020) https://link.springer.com/article/10.3758/s13428-019-01316-z#Bib1
            'Auditory',
            'Gustatory',
            'Interoceptive',
            'Haptic',
            'Olfactory',
            'Visual',
            'Foot_leg',
            'Hand_arm',
            'Head',
            'Mouth',
            'Torso',
            ]

def get_feature_list_using_third_party_libraries():
    return ['polysemy', 'num_morpheme_poly']

def get_feature_list_requiring_calculation():
    return ['PMI']

def get_user_contributed_features():
    return []

def get_all_features(sentence: 'sentspace.Sentence.Sentence', databases):
    """
    Given a Sentence, return a dict mapping each feature name to a list of per-token feature values
    """
    
    result = {}
    for feature in (get_feature_list() +
                    get_feature_list_using_third_party_libraries() + 
                    get_user_contributed_features()):
        # we don't want to compute num_morpheme from the dictionary DB by default;
        # we only fall back to it when the polyglot library is unavailable.
        if feature == 'num_morpheme':
            continue
        computed_feature = get_feature(sentence, feature, databases)
        # even though we compute "num_morpheme_poly", record it as "num_morpheme"
        # when polyglot is unavailable: the _poly suffix refers to the polyglot
        # library, and without it get_feature falls back to the dictionary-based
        # num_morpheme values anyway.
        if feature == 'num_morpheme_poly':
            try:
                import polyglot
            except ImportError:
                feature = 'num_morpheme'
        result[feature] = computed_feature
    return result


def get_feature(sentence: 'sentspace.Sentence.Sentence', feature, databases={}):
    '''
    get specific `feature` for the tokens in `sentence`; fall back to using `lemmas` if necessary
    '''

    def get_feature_(token, lemma, feature):
        """given a `token` (and its `lemma`) and a feature to extract, returns the
            value of that feature using the available databases

        Args:
            token (str): the token (word) to extract a feature for
            lemma (str): lemmatized form of `token`, used as a fallback lookup key
            feature (str): name identifier of the feature acc. to the predefined convention
            databases (dict, in-scope): dictionary of feature --> (word --> feature_value) dictionaries.
                                        Defaults to {}.

        Returns:
            Any: feature value (np.nan if both token and lemma are out of vocabulary)
        """
        # if the feature is from a database we have on disk
        # database can be a dictionary or an object that implements
        # get(key, default)
        if feature in get_feature_list():
            feature_dict = databases[feature]            
            try:
                return feature_dict[token]
            except KeyError as e:
                try:
                    return feature_dict[lemma]
                except KeyError as e_:
                    return np.nan

        # Other databases we use from libraries we load such as NLTK-Wordnet and Polyglot
        elif feature in get_feature_list_using_third_party_libraries():
            if feature == 'polysemy':
                from nltk.corpus import wordnet
                # first try the lookup with the token itself
                if (synsets := wordnet.synsets(token)):
                    return len(synsets)  # number of WordNet senses (TODO: does a word's synsets include itself?)
                # if the token is OOV, try again with the lemma
                elif (synsets := wordnet.synsets(lemma)):
                    return len(synsets)
                # otherwise treat the word as having a single sense (itself)
                return 1
            elif feature == 'num_morpheme_poly':
                try:
                    from polyglot.text import Word
                    # try first to obtain # morphemes of the token
                    if (morphed := Word(token, language='en').morphemes):
                        return len(morphed)
                    # otherwise, try using the lemmatized form
                    elif (morphed := Word(lemma, language='en').morphemes):
                        return len(morphed)
                    # if both token and lemma are OOV, should we return nan, or 1 (i.e., the full word is one morpheme)?
                    # returning 1 only means we failed to analyze the word's morphology,
                    # not necessarily that it *is* a single morpheme
                    return 1  # np.nan
                except ImportError as e:
                    # fall back to simply using a dictionary-based feature
                    # TODO make a note of this somewhere
                    io.log(e.msg, type='WARN')
                    return get_feature_(token, lemma, 'num_morpheme')

        else:
            raise ValueError(f'unable to compute unknown feature `{feature}`')


    features_list = []

    for token, lemma in zip(sentence.tokens, sentence.lemmas):
        features_list += [get_feature_(token, lemma, feature)]

    return features_list


def return_percentile_df(bench_df, usr_df):
    """for each sentence in `usr_df`, return the percentile of each feature value
    relative to the corresponding benchmark distribution in `bench_df`"""
    rows = []
    # for each sentence, get the percentile score of each feature
    for _, row in usr_df.iterrows():
        temp_row = {}
        # iterate through the feature columns
        for col in usr_df.columns:
            if col == 'Sentence no.':
                temp_row[col] = row[col]
                continue
            temp_row[col] = percentileofscore(bench_df[col], row[col])
        # collect the percentile row for this sentence
        rows.append(temp_row)

    perc_df = pd.DataFrame(rows, columns=usr_df.columns)
    perc_df = perc_df.drop(columns=['Sentence no.'])
    return perc_df




@cache_to_mem
def load_databases(features='all', path='~/.cache/sentspace/', 
                   ignore_case=True,
                  ):
    """
    Load dicts mapping word to feature value
    If one feature, provide in list format
    """
    path = str(Path(path).expanduser().resolve()) + '/'
    io.log("loading databases with all features")
    databases = {}
    if features == 'all':
        features = get_feature_list()
    for feature in features:
        if not os.path.exists(path+feature+'.pkl'):
            sentspace.utils.s3.load_feature(key=feature+'.pkl', root_dir=path)
        with open(path+feature+'.pkl', 'rb') as f:
            d = pickle.load(f)
            # if ignore_case:  # add lowercase version to feature database
            #     for key, val in d.copy().items():
            #         d[str(key).lower()] = val
            databases[feature] = d

    sanity_check_databases(databases)
    io.log("---done--- loading databases with all features")
    return databases


def sanity_check_databases(databases):
    '''
    perform sanity checks upon loading various datasets
    to ensure validity of the loaded data
    '''

    assert databases['NRC_Arousal']['happy'] == 0.735
    assert databases['NRC_Valence']['happy'] == 1
    assert databases['OSC']['happy'] == 0.951549893181384
    assert abs(databases['aoa']['a'] - 2.893384) < 1e-4
    assert databases['concreteness']['roadsweeper'] == 4.85
    # assert abs(databases['imag']['abbey'] - 5.344) < 1e-4
    assert databases['total_degree_centrality']['a'] == 30
    assert databases['lexical_decision_RT']['a'] == 798.917
    assert abs(databases['log_contextual_diversity']['a'] - 3.9234) < 1e-4
    assert abs(databases['log_lexical_frequency']['a'] - 6.0175) < 1e-4
    assert databases['n_orthographic_neighbors']['a'] == 950.59
    assert databases['num_morpheme']['abbreviated'] == 4
    assert abs(databases['prevalence']['a'] - 1.917) < 1e-3
    assert databases['surprisal-3']['beekeeping'] == 10.258

Functions

def get_all_features(sentence: Sentence, databases)

Given a Sentence, return a dict mapping each feature name to a list of per-token feature values

Expand source code
def get_all_features(sentence: 'sentspace.Sentence.Sentence', databases):
    """
    Given a Sentence, return a dict mapping each feature name to a list of per-token feature values
    """
    
    result = {}
    for feature in (get_feature_list() +
                    get_feature_list_using_third_party_libraries() + 
                    get_user_contributed_features()):
        # we don't want to compute num_morpheme from the dictionary DB by default;
        # we only fall back to it when the polyglot library is unavailable.
        if feature == 'num_morpheme':
            continue
        computed_feature = get_feature(sentence, feature, databases)
        # even though we compute "num_morpheme_poly", record it as "num_morpheme"
        # when polyglot is unavailable: the _poly suffix refers to the polyglot
        # library, and without it get_feature falls back to the dictionary-based
        # num_morpheme values anyway.
        if feature == 'num_morpheme_poly':
            try:
                import polyglot
            except ImportError:
                feature = 'num_morpheme'
        result[feature] = computed_feature
    return result
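
A hedged usage sketch: the function expects a sentspace Sentence plus the databases returned by load_databases. Only the .tokens and .lemmas attributes are read here, so the SimpleNamespace stand-in below (a hypothetical substitute, not the real Sentence class) is enough for illustration; the 'polysemy' value additionally requires NLTK's wordnet data.

from types import SimpleNamespace

from sentspace.lexical.utils import get_all_features, load_databases

# hypothetical stand-in for sentspace.Sentence.Sentence: this module only
# reads .tokens and .lemmas
sentence = SimpleNamespace(tokens=['the', 'dogs', 'ran'],
                           lemmas=['the', 'dog', 'run'])

databases = load_databases(features='all')        # cached word -> value dicts
features = get_all_features(sentence, databases)  # feature name -> per-token values

print(features['aoa'])       # one age-of-acquisition value (or nan) per token
print(features['polysemy'])  # WordNet sense counts per token
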
def get_feature(sentence: Sentence, feature, databases={})

get specific feature for the tokens in sentence; fall back to using lemmas if necessary

Expand source code
def get_feature(sentence: 'sentspace.Sentence.Sentence', feature, databases={}):
    '''
    get specific `feature` for the tokens in `sentence`; fall back to using `lemmas` if necessary
    '''

    def get_feature_(token, lemma, feature):
        """given a `token` (and its `lemma`) and a feature to extract, returns the
            value of that feature using the available databases

        Args:
            token (str): the token (word) to extract a feature for
            lemma (str): lemmatized form of `token`, used as a fallback lookup key
            feature (str): name identifier of the feature acc. to the predefined convention
            databases (dict, in-scope): dictionary of feature --> (word --> feature_value) dictionaries.
                                        Defaults to {}.

        Returns:
            Any: feature value (np.nan if both token and lemma are out of vocabulary)
        """
        # if the feature is from a database we have on disk
        # database can be a dictionary or an object that implements
        # get(key, default)
        if feature in get_feature_list():
            feature_dict = databases[feature]            
            try:
                return feature_dict[token]
            except KeyError as e:
                try:
                    return feature_dict[lemma]
                except KeyError as e_:
                    return np.nan

        # Other databases we use from libraries we load such as NLTK-Wordnet and Polyglot
        elif feature in get_feature_list_using_third_party_libraries():
            if feature == 'polysemy':
                from nltk.corpus import wordnet
                # first try the lookup with the token itself
                if (synsets := wordnet.synsets(token)):
                    return len(synsets)  # number of WordNet senses (TODO: does a word's synsets include itself?)
                # if the token is OOV, try again with the lemma
                elif (synsets := wordnet.synsets(lemma)):
                    return len(synsets)
                # otherwise treat the word as having a single sense (itself)
                return 1
            elif feature == 'num_morpheme_poly':
                try:
                    from polyglot.text import Word
                    # try first to obtain # morphemes of the token
                    if (morphed := Word(token, language='en').morphemes):
                        return len(morphed)
                    # otherwise, try using the lemmatized form
                    elif (morphed := Word(lemma, language='en').morphemes):
                        return len(morphed)
                    # if both token and lemma are OOV, should we return nan, or 1 (i.e., the full word is one morpheme)?
                    # returning 1 only means we failed to analyze the word's morphology,
                    # not necessarily that it *is* a single morpheme
                    return 1  # np.nan
                except ImportError as e:
                    # fall back to simply using a dictionary-based feature
                    # TODO make a note of this somewhere
                    io.log(e.msg, type='WARN')
                    return get_feature_(token, lemma, 'num_morpheme')

        else:
            raise ValueError(f'unable to compute unknown feature `{feature}`')


    features_list = []

    for token, lemma in zip(sentence.tokens, sentence.lemmas):
        features_list += [get_feature_(token, lemma, feature)]

    return features_list
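
A self-contained sketch of the token-then-lemma fallback, using a tiny hand-built databases dict (made-up values) in place of the pickled ones. The SimpleNamespace stand-in is hypothetical; only .tokens and .lemmas are accessed.

from types import SimpleNamespace

from sentspace.lexical.utils import get_feature

sentence = SimpleNamespace(tokens=['dogs', 'barked'],
                           lemmas=['dog', 'bark'])

# toy database: 'dogs' is missing, so the lookup falls back to the lemma 'dog';
# neither 'barked' nor 'bark' is present, so that position yields np.nan
databases = {'concreteness': {'dog': 4.85, 'cat': 4.90}}

print(get_feature(sentence, 'concreteness', databases))
# -> [4.85, nan]
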
def get_feature_list()
Expand source code
def get_feature_list():
    return ['NRC_Arousal', # (Mohammad 2018)
            'NRC_Valence', # (Mohammad 2018)
            'NRC_Dominance', # (Mohammad 2018)
            'OSC', # Orthographic-semantics consistency (Marelli & Amenta, 2018)
            'aoa', # Age of Acquisition (Kuperman et al., 2012)
            'concreteness', # (Brysbaert et al., 2014)
            'lexical_decision_RT', # (Balota et al., 2007)
            'log_contextual_diversity', 
            'log_lexical_frequency', 
            'n_orthographic_neighbors', 
            'num_morpheme',
            'prevalence', 
            'surprisal-1', 
            'surprisal-2', 
            'surprisal-3', 
            'surprisal-4',
            'total_degree_centrality',
            'imageability', # Glasgow norms (Scott et al., 2019) https://link.springer.com/article/10.3758/s13428-018-1099-3#Sec1
            'body-object-interaction', # (Pexman et al., 2019) https://link.springer.com/article/10.3758/s13428-018-1171-z#Sec9
            'zipf', # SUBTLEXus
            'socialness', # (Diveica et al., 2022) https://link.springer.com/article/10.3758/s13428-022-01810-x#Sec15
            # Lancaster sensorimotor norms (Lynott et al., 2020) https://link.springer.com/article/10.3758/s13428-019-01316-z#Bib1
            'Auditory',
            'Gustatory',
            'Interoceptive',
            'Haptic',
            'Olfactory',
            'Visual',
            'Foot_leg',
            'Hand_arm',
            'Head',
            'Mouth',
            'Torso',
            ]
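
One way to use the canonical list is to check which dictionary-backed features a loaded databases mapping actually covers (a small sketch, assuming the databases came from load_databases):

from sentspace.lexical.utils import get_feature_list, load_databases

databases = load_databases(features='all')

# report any canonical feature that did not make it into the loaded databases
missing = [f for f in get_feature_list() if f not in databases]
print(missing or 'all dictionary-backed features are loaded')
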
def get_feature_list_requiring_calculation()
Expand source code
def get_feature_list_requiring_calculation():
    return ['PMI']
def get_feature_list_using_third_party_libraries()
Expand source code
def get_feature_list_using_third_party_libraries():
    return ['polysemy', 'num_morpheme_poly']
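
These two features come from NLTK WordNet and polyglot rather than from pickled databases. A direct sketch of the underlying calls (assumes the wordnet corpus has been fetched via nltk.download('wordnet'); polyglot and its English morphology models are optional):

from nltk.corpus import wordnet

# polysemy: the number of WordNet synsets the word participates in
print(len(wordnet.synsets('bank')))  # a highly polysemous word

try:
    from polyglot.text import Word
    # num_morpheme_poly: the number of morphemes polyglot segments the word into
    print(len(Word('unhappiness', language='en').morphemes))
except ImportError:
    # without polyglot, the module falls back to the dictionary-based num_morpheme
    pass
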
def get_user_contributed_features()
Expand source code
def get_user_contributed_features():
    return []
def return_percentile_df(bench_df, usr_df)
Expand source code
def return_percentile_df(bench_df, usr_df):
    """for each sentence in `usr_df`, return the percentile of each feature value
    relative to the corresponding benchmark distribution in `bench_df`"""
    rows = []
    # for each sentence, get the percentile score of each feature
    for _, row in usr_df.iterrows():
        temp_row = {}
        # iterate through the feature columns
        for col in usr_df.columns:
            if col == 'Sentence no.':
                temp_row[col] = row[col]
                continue
            temp_row[col] = percentileofscore(bench_df[col], row[col])
        # collect the percentile row for this sentence
        rows.append(temp_row)

    perc_df = pd.DataFrame(rows, columns=usr_df.columns)
    perc_df = perc_df.drop(columns=['Sentence no.'])
    return perc_df
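
A runnable toy example (made-up values) showing how each user value is converted to its percentile within the corresponding benchmark column; the 'Sentence no.' column is carried through the loop and then dropped from the returned frame:

import pandas as pd

from sentspace.lexical.utils import return_percentile_df

bench_df = pd.DataFrame({'Sentence no.': range(1, 6),
                         'aoa': [3.0, 4.0, 5.0, 6.0, 7.0]})
usr_df = pd.DataFrame({'Sentence no.': [1, 2],
                       'aoa': [4.5, 6.5]})

# 4.5 exceeds 2 of the 5 benchmark values -> 40th percentile; 6.5 -> 80th
print(return_percentile_df(bench_df, usr_df))
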
def sanity_check_databases(databases)

perform sanity checks upon loading various datasets to ensure validity of the loaded data

Expand source code
def sanity_check_databases(databases):
    '''
    perform sanity checks upon loading various datasets
    to ensure validity of the loaded data
    '''

    assert databases['NRC_Arousal']['happy'] == 0.735
    assert databases['NRC_Valence']['happy'] == 1
    assert databases['OSC']['happy'] == 0.951549893181384
    assert abs(databases['aoa']['a'] - 2.893384) < 1e-4
    assert databases['concreteness']['roadsweeper'] == 4.85
    # assert abs(databases['imag']['abbey'] - 5.344) < 1e-4
    assert databases['total_degree_centrality']['a'] == 30
    assert databases['lexical_decision_RT']['a'] == 798.917
    assert abs(databases['log_contextual_diversity']['a'] - 3.9234) < 1e-4
    assert abs(databases['log_lexical_frequency']['a'] - 6.0175) < 1e-4
    assert databases['n_orthographic_neighbors']['a'] == 950.59
    assert databases['num_morpheme']['abbreviated'] == 4
    assert abs(databases['prevalence']['a'] - 1.917) < 1e-3
    assert databases['surprisal-3']['beekeeping'] == 10.258
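
In practice this check already runs at the end of load_databases; calling it by hand mainly makes sense if the databases were assembled some other way. A minimal sketch, assuming the feature pickles are present under ~/.cache/sentspace/ (or fetchable via sentspace.utils.s3):

from sentspace.lexical.utils import load_databases, sanity_check_databases

databases = load_databases(features='all')  # already sanity-checked internally

# re-running the check raises AssertionError if any pickle is stale or corrupted
sanity_check_databases(databases)
print('databases look good')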