Source code for nvm.aux_spacy.factories.get_doc_sentences

#!/usr/bin/env python3

import logging
from spacy.tokens import Doc
from spacy.language import Language

from ..set_container_extensions import set_container_extensions_from_dict


[docs]@Language.factory( "get_doc_sentences_as_list", default_config={ "log0": logging.getLogger("dummy"), }, ) def get_doc_sentences_as_list_component( nlp: Language, name: str, log0: logging.Logger, ): """Get document sentences as a list. Examples -------- >>> import spacy >>> from nvm.aux_spacy import get_doc_sentences_as_list_component >>> # nlp = spacy.blank("en") >>> nlp = spacy.load("en_core_web_sm") >>> nlp.add_pipe("get_doc_sentences_as_list", "SENTS") >>> doc = nlp("This is the first sentence. This is the second sentence.") >>> assert len(doc._.sents) == 2 >>> doc._.sents ['This is the first sentence.', 'This is the second sentence.'] """ return DocSentsAsListComponent( nlp=nlp, log0=log0, )
[docs]class DocSentsAsListComponent: def __init__( self, nlp: Language, log0: logging.Logger = logging.getLogger("dummy"), ): # Language, at least nlp = spacy.load("en_core_web_sm") nlp.enable_pipe("senter") # Placeholder dictionary for new functions self.doc_fn_dict = dict() # Function def sents_as_list(doc): """Get sentences as list of strings.""" sents = [sent.text for sent in doc.sents] return sents # Add function do dictionary self.doc_fn_dict["sents"] = sents_as_list # Update Doc extensions. set_container_extensions_from_dict(Doc, self.doc_fn_dict, log0=log0) def __call__(self, doc: Doc) -> Doc: return doc