Module sentspace.syntax
from collections import defaultdict
import os
import sentspace
from pathlib import Path
import pandas as pd
from nltk.tree import ParentedTree
from sentspace.syntax import utils
from sentspace.syntax.features import DLT, Feature, LeftCorner, Tree
from sentspace.utils import io, text
from sentspace.utils.caching import cache_to_disk
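# hide internal helpers from the generated pdoc documentation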
__pdoc__ = {'compute_tree_dlt_left_corner': False,
'utils.calcEmbd': False,
'utils.calcDLT': False,
'utils.printlemmas': False,
'utils.tree': False
}
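# PERL_BADLANG=0 silences Perl's locale warnings (presumably triggered by the feature extraction scripts)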
os.environ['PERL_BADLANG'] = '0'
def get_features(sentence: str = None, identifier: str = None,
dlt: bool = True, left_corner: bool = True,
syntax_port: int = 8000) -> dict:
"""executes the syntactic features pipeline
Args:
sentence (str, optional): exactly 1 sentence [None].
dlt (bool, optional): calculate DLT feature? [False].
left_corner (bool, optional): calculate Left Corner feature? [False].
Returns:
sentspace.syntax.features.Feature: a Feature instance with appropriate attributes
"""
# if the "sentence" actually consists of multiple sentences (as determined by the
# NLTK Punkt Sentence Tokenizer), we want to repeat the below block per sentence and
# then pool across sentences
from nltk.tokenize import sent_tokenize
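    # strip non-ASCII characters before sentence-splitting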
stripped = ''.join([i if ord(i) < 128 else '' for i in sentence])
sentences = sent_tokenize(stripped, language='english')
features_to_pool = defaultdict(list)
features = None
dlt_concat, left_corner_concat = None, None
    for sub_sentence in sentences:
features = Feature()
if dlt or left_corner:
# io.log(f'parsing into syntax tree: `{sentence}`')
# parsed = parse_input(sentence)
try:
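                # utils.compute_trees queries a constituency parser service expected to be running locally on syntax_port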
server_url = f'http://localhost:{syntax_port}/fullberk'
features.tree = Tree(utils.compute_trees(sub_sentence, server_url=server_url))
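                # fail fast: surface parser failures (a RuntimeError result or a missing .raw) before computing features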
                if isinstance(features.tree, RuntimeError):
                    raise features.tree
getattr(features.tree, 'raw')
# print(parse_input(sentence), features.tree)
if dlt and features.tree.raw is not None:
# io.log(f'computing DLT feature')
dlt_stdout = utils.compute_feature('dlt.sh', features.tree.raw)
                    if isinstance(dlt_stdout, RuntimeError):
                        raise dlt_stdout
                    features.dlt = DLT(dlt_stdout, sub_sentence, identifier)
# io.log(f'--- done: DLT')
if left_corner and features.tree.raw is not None:
# io.log(f'computing left corner feature')
left_corner_stdout = utils.compute_feature('leftcorner.sh', features.tree.raw)
                    if isinstance(left_corner_stdout, RuntimeError):
                        raise left_corner_stdout
                    features.left_corner = LeftCorner(left_corner_stdout, sub_sentence, identifier)
# io.log(f'--- done: left corner')
                features_to_pool['dlt'].append(features.dlt)
                features_to_pool['left_corner'].append(features.left_corner)
            except AttributeError:
import traceback
io.log(f'FAILED: AttributeError while processing '
f'Tree [{features.tree}] features for chunk [{sub_sentence}] of sentence [{sentence}] '
f'traceback: {traceback.format_exc()}',
type='ERR')
# for attr in ['dlt', 'left_corner', 'tree']:
# io.log(f'hasattr(features, {attr}): {hasattr(features, attr)}', type='ERR')
# io.log(f'hasattr(features.tree, raw): {hasattr(features.tree, "raw")}', type='ERR')
            except RuntimeError:
                io.log(f'FAILED: RuntimeError while processing Tree features for chunk [{sub_sentence}] of sentence [{sentence}]', type='ERR')
    # pool across sub-sentences: concatenate the per-chunk frames, then average grouped by token index
if dlt:
try:
dlt_concat = pd.concat(features_to_pool['dlt'], axis='index')
dlt_concat = dlt_concat.groupby('index').mean()
dlt_concat['sentence'] = sentence
except ValueError:
dlt_concat = pd.DataFrame()
else:
dlt_concat = None
if left_corner:
try:
left_corner_concat = pd.concat(features_to_pool['left_corner'], axis='index')
left_corner_concat = left_corner_concat.groupby('index').mean()
left_corner_concat['sentence'] = sentence
except ValueError:
left_corner_concat = pd.DataFrame()
else:
left_corner_concat = None
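    # NOTE: sub_sentence below is the for-loop variable, so these ratios reflect the last sub-sentence only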
tokenized = utils.tokenize(sub_sentence).split()
tagged_sentence = text.get_pos_tags(tokenized)
is_content_word = utils.get_is_content(tagged_sentence, content_pos=text.pos_for_content) # content or function word
pronoun_ratio = utils.get_pronoun_ratio(tagged_sentence)
content_ratio = utils.get_content_ratio(is_content_word)
return {
'index': identifier,
'sentence': sentence,
# 'tags': tagged_sentence,
# 'content_words': is_content_word,
'pronoun_ratio': pronoun_ratio,
'content_ratio': content_ratio,
# 'tree': features.tree
'dlt': dlt_concat,
'leftcorner': left_corner_concat,
}
Sub-modules
sentspace.syntax.features
sentspace.syntax.utils
Functions
def get_features(sentence: str = None, identifier: str = None, dlt: bool = True, left_corner: bool = True, syntax_port: int = 8000) -> dict
Executes the syntactic features pipeline on a single input sentence.
Args
sentence : str, optional
exactly 1 sentence [None].
identifier : str, optional
identifier used to index the output [None].
dlt : bool, optional
calculate the DLT feature? [True].
left_corner : bool, optional
calculate the Left Corner feature? [True].
syntax_port : int, optional
port of the locally running parser service [8000].
Returns
dict
computed features keyed by name: 'index', 'sentence', 'pronoun_ratio', 'content_ratio', 'dlt', 'leftcorner'.
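Example: a minimal usage sketch, not taken from the package itself. It assumes a compatible parsing service is already listening on localhost at the default syntax_port (8000); the sentence and identifier values are purely illustrative.
import sentspace.syntax
# illustrative input; assumes the local parser service backing
# utils.compute_trees is up and reachable on port 8000
features = sentspace.syntax.get_features(
    sentence='The dog chased the cat across the yard.',
    identifier='example-0',
    dlt=True,
    left_corner=True,
    syntax_port=8000,
)
print(features['pronoun_ratio'], features['content_ratio'])
print(features['dlt'])         # pandas DataFrame of pooled DLT metrics (empty on parse failure)
print(features['leftcorner'])  # pandas DataFrame of pooled Left Corner metrics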