Source code for nvm.aux_spacy.factories.get_doc_basic_metrics

#!/usr/bin/env python3

import logging
from spacy.language import Language
from spacy.attrs import IS_ALPHA
from spacy.tokens import Doc, Token

from ..set_container_extensions import set_container_extensions_from_dict


# NOTE(review): spaCy expects factory config values to be JSON-serializable;
# a Logger instance is not -- confirm that saving the pipeline/config works.
@Language.factory(
    "get_doc_basic_metrics",
    default_config={"log0": logging.getLogger("dummy")},
)
def get_doc_basic_metrics_component(
    nlp: Language,
    name: str,
    log0: logging.Logger,
):
    """Create the ``get_doc_basic_metrics`` pipeline component.

    Registered with spaCy under the factory name ``"get_doc_basic_metrics"``.

    Parameters
    ----------
    nlp : Language
        Pipeline object the component is created for; passed through to
        ``DocBasicMetricsComponent``.
    name : str
        Name of the component instance in the pipeline (part of the factory
        signature; not used here).
    log0 : logging.Logger
        Logger passed through to ``DocBasicMetricsComponent``.

    Returns
    -------
    DocBasicMetricsComponent
        The component instance to be added to the pipeline.

    Examples
    --------
    >>> import spacy
    >>> from dframcy import DframCy
    >>>
    >>> from nvm import disp_df
    >>> from nvm.aux_spacy import get_doc_basic_metrics_component
    >>>
    >>> nlp = spacy.load("en_core_web_sm")
    >>> nlp.add_pipe("get_doc_basic_metrics", "BASIC")
    >>>
    >>> dframcy = DframCy(nlp)
    >>>
    >>> doc = dframcy.nlp(
    ...     "This sentence contains two verbs and this is how many verbs should be found."
    ... )
    >>>
    >>> df0 = dframcy.to_dataframe(
    ...     doc,
    ...     columns=["text", "lemma_", "is_alpha", "pos_", "tag_", "is_sent_start"],
    ...     custom_attributes=["is_VB", "is_VB_without_be_and_have"],
    ... )
    >>> disp_df(df0)
    """
    component = DocBasicMetricsComponent(nlp=nlp, log0=log0)
    return component
class DocBasicMetricsComponent:
    """Pipeline component that registers basic Token/Doc metric functions.

    All of the work happens at construction time: two dictionaries mapping
    extension names to one-argument functions are built and handed to
    ``set_container_extensions_from_dict`` for ``Token`` and ``Doc``.
    Calling the component on a ``Doc`` returns it unchanged.

    Every metric only considers alphabetic tokens (``token.is_alpha``).

    Attributes
    ----------
    tok_fn_dict : dict
        Token-level predicates: ``is_VB`` and ``is_VB_without_be_and_have``.
    doc_fn_dict : dict
        Doc-level counters: ``WORD_count``, ``NOUN_count``, ``ADJ_count``,
        ``VERB_count``, ``VERB_count_without_be_and_have``, ``VB_count``,
        ``VB_count_without_be_and_have``, ``JJ_count``, ``JJRs_count`` and
        ``JJSs_count``.

    Methods
    -------
    __call__(doc)
        Return ``doc`` unchanged.
    """

    # Lemmas ignored by the "*_without_be_and_have" metrics.
    _BE_HAVE = ("be", "have")

    def __init__(
        self,
        nlp: Language,
        log0: logging.Logger = logging.getLogger("dummy"),
    ):
        """Build the metric functions and register them as extensions.

        Parameters
        ----------
        nlp : Language
            Accepted so the component matches the factory call; it is not
            used by this constructor.
        log0 : logging.Logger
            Logger forwarded to ``set_container_extensions_from_dict``.
        """
        matcher = self._alpha_token_matcher
        counter = self._token_counter
        skip = self._BE_HAVE

        # Token-level predicates (fine-grained tag "VB" = verb, base form).
        self.tok_fn_dict = {
            "is_VB": matcher("tag_", "VB"),
            "is_VB_without_be_and_have": matcher("tag_", "VB", skip),
        }

        # Doc-level counters; the first group uses the coarse POS attribute,
        # the second group the fine-grained TAG attribute.
        self.doc_fn_dict = {
            "WORD_count": self._word_count,
            "NOUN_count": counter(matcher("pos_", "NOUN")),
            "ADJ_count": counter(matcher("pos_", "ADJ")),
            "VERB_count": counter(matcher("pos_", "VERB")),
            "VERB_count_without_be_and_have": counter(
                matcher("pos_", "VERB", skip)
            ),
            "VB_count": counter(matcher("tag_", "VB")),
            "VB_count_without_be_and_have": counter(
                matcher("tag_", "VB", skip)
            ),
            "JJ_count": counter(matcher("tag_", "JJ")),
            "JJRs_count": counter(matcher("tag_", "JJR")),
            "JJSs_count": counter(matcher("tag_", "JJS")),
        }

        # Update Token and Doc extensions.
        set_container_extensions_from_dict(Token, self.tok_fn_dict, log0=log0)
        set_container_extensions_from_dict(Doc, self.doc_fn_dict, log0=log0)

    @staticmethod
    def _alpha_token_matcher(attr, value, excluded_lemmas=()):
        """Return a predicate for alphabetic tokens with ``attr == value``.

        Parameters
        ----------
        attr : str
            Name of the token attribute to compare (``"pos_"`` or ``"tag_"``).
        value : str
            Required value of that attribute.
        excluded_lemmas : tuple of str, optional
            Tokens whose ``lemma_`` is listed here never match.

        Returns
        -------
        callable
            ``predicate(token) -> bool``.
        """

        def predicate(token):
            return (
                getattr(token, attr) == value
                and token.is_alpha
                and token.lemma_ not in excluded_lemmas
            )

        return predicate

    @staticmethod
    def _token_counter(predicate):
        """Return ``count(doc) -> int``: number of tokens matching ``predicate``."""

        def count(doc):
            # Count lazily instead of materialising a throwaway list.
            return sum(1 for token in doc if predicate(token))

        return count

    @staticmethod
    def _word_count(doc):
        """Get word count for doc (based on the IS_ALPHA attribute)."""
        # count_by maps attribute value -> frequency; key 1 holds the
        # alphabetic tokens and is absent when there are none.
        return doc.count_by(IS_ALPHA).get(1, 0)

    def __call__(self, doc: Doc) -> Doc:
        """Return ``doc`` unchanged; the metrics are exposed via extensions."""
        return doc