Source code for nvm.aux_str.aux_str

#!/usr/bin/env python3


import re
from typing import (
    List,
    Dict,
    Pattern,
    Union,
)

from .clean_str_mappings import (  # noqa: F401
    CLEAN_STR_MAPPINGS_DROP_HASHTAGS,
    CLEAN_STR_MAPPINGS_TINY,
    CLEAN_STR_MAPPINGS_LARGE,
    CLEAN_STR_MAPPINGS_HUGE,
    CLEAN_STR_MAPPINGS_SPACE,
)


def is_ascii(s: str) -> bool:
    """Check if all characters in string ``s`` are ASCII (U+0000-U+007F).

    Parameters
    ----------
    s : str
        String to be checked if it contains only ASCII characters.

    Returns
    -------
    bool
        ``True`` if ``s`` contains only ASCII characters. An empty string
        yields ``True`` because there is no character that fails the check.

    Examples
    --------

    >>> from nvm.aux_str import is_ascii
    >>> assert is_ascii("abc 123")
    >>> assert is_ascii("")
    >>> assert not is_ascii("abc 123 ×")
    >>> assert not is_ascii("abc 123\\u00a0")  # NO-BREAK SPACE (U+00A0)

    """
    # A character is ASCII exactly when its code point is below 128.
    return all(ord(c) < 128 for c in s)


def is_ascii_alt(s: str) -> bool:
    """Check if the characters in string s are in ASCII, U+0-U+7F.

    The check is done by trying to encode ``s`` with the ``"ascii"`` codec:
    the encoding succeeds only when every character is an ASCII character.

    Parameters
    ----------
    s : str
        String to be checked if it contains only ASCII characters.


    Returns
    -------
    bool
        ``True`` if ``s`` contains only ASCII characters. An empty string
        yields ``True``. Strings containing lone surrogate code points
        (e.g. ``"\\ud800"``) yield ``False`` rather than raising
        ``UnicodeEncodeError``.


    Examples
    --------

    >>> from nvm.aux_str import is_ascii_alt
    >>> assert is_ascii_alt("abc 123")
    >>> assert is_ascii_alt("")
    >>> assert not is_ascii_alt("abc 123 ×")
    >>> assert not is_ascii_alt("abc 123\\u00a0")  # NO-BREAK SPACE (U+00A0)
    >>> assert not is_ascii_alt("\\ud800")  # lone surrogate

    """
    try:
        # The ASCII codec rejects every code point above U+007F, including
        # lone surrogates that the UTF-8 codec refuses to encode at all.
        s.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True


def clean_str(
    text: str,
    mappings: List[Dict[str, List[Union[str, Pattern[str]]]]] = CLEAN_STR_MAPPINGS_TINY,
) -> str:
    """Clean string replacing any unwanted text with the desired.

    This function can be used to clean text from redundant whitespace characters
    and other common problems.

    The input is first converted with :python:`str(text)`. Then every pattern
    of every mapping is applied in order using ``re.sub``. Finally, each run
    of two or more whitespace characters is replaced with a single space and
    leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        Text to be cleaned.

    mappings : List[Dict[str, List[Union[str, Pattern[str]]]]], default=[{' ': ['\\n', '\\r', '\\t']}, {'-': ['−', '–', '—', '―', '﹣', '－']}]
        List of mappings to be used for text cleaning. This should be a list of
        dictionaries. Dictionary keys should contain strings that are used as
        replacement for matches of string patterns or regexes provided as list
        in dictionary key value. The default value is sourced from ``nvm.aux_str.clean_str_mappings.CLEAN_STR_MAPPINGS_TINY``.

        Patterns given as plain ``str`` are passed to ``re.sub`` unchanged, so
        they are interpreted as regular expressions; use ``re.escape`` for
        text containing regex metacharacters that should match literally.
        Likewise, keys are used as ``re.sub`` replacement templates, so
        backslash escapes in a key are processed by ``re.sub``.

    Returns
    -------
    str
        Clean text.

    Examples
    --------
    To clean a string use:

    >>> from nvm.aux_str import clean_str
    >>> text_dirty = "  one two  three\\t \\n\\n\\r four...  "
    >>> text_clean = clean_str(text=text_dirty)
    >>> # print(text_dirty)
    >>> print(text_clean)
    one two three four...

    This function can be applied to pandas dataframe column, for example:

    >>> # let df0 be a dataframe that contains text column "text"
    >>> # to clean its content in place we may run
    >>> text_field = "text"
    >>> df0[text_field] = df0[text_field].apply(clean_str)

    .. role:: python(code)
        :language: python


    The ``mappings`` argument should be a list of dictionaries that define
    string pattern- or regex-based replacements used for text cleaning.
    Dictionary keys should contain strings that are used as replacement for
    matches of patterns provided as list in corresponding (dictionary key) value
    (:python:`List[Dict[str, List[Union[str, Pattern[str]]]]]`).

    For example, to replace all occurrences of
    LF (Line Feed, ``"\\n"``),
    CR (Carriage Return, ``"\\r"``) and
    HT (Horizontal Tab, ``"\\t"``) with
    ``" "`` (space), as well as,
    replace all occurrences of some dash-like characters with ``"-"``,
    the following mapping can be used:

    >>> mappings = [
    ...     {
    ...         " ": [  # Unicode Character 'SPACE' (U+0020)
    ...             "\\n",  # LF (Line Feed)
    ...             "\\r",  # CR (Carriage Return)
    ...             "\\t",  # HT (Horizontal Tab)
    ...         ],
    ...     },
    ...     {
    ...         "-": [  # Unicode Character 'HYPHEN-MINUS' (U+002D) # chr(45) ord("-") ord("\u002D")
    ...             "\\u2212",  # Unicode Character 'MINUS SIGN' (U+2212)
    ...             "\\u2013",  # Unicode Character 'EN DASH' (U+2013) # chr(8211) ↔ ord("–") ↔ ord("\u2013")
    ...             "\\u2014",  # Unicode Character 'EM DASH' (U+2014)
    ...             "\\u2015",  # Unicode Character 'HORIZONTAL BAR' (U+2015)
    ...             "\\uFE63",  # Unicode Character 'SMALL HYPHEN-MINUS' (U+FE63)
    ...             "\\uFF0D",  # Unicode Character 'FULLWIDTH HYPHEN-MINUS' (U+FF0D)
    ...         ],
    ...     },
    ... ]


    .. note::
        **Hint:** an empty string can be used to remove text matching a regex, for
        example:

        >>> mappings = [{"": [re.compile(r"[0-9]")]}]  # remove digits


    :python:`nvm.aux_str` also provides few useful mappings:

    >>> # Import example mappings:
    >>> from nvm.aux_str import CLEAN_STR_MAPPINGS_TINY
    >>> from nvm.aux_str import CLEAN_STR_MAPPINGS_LARGE
    >>> from nvm.aux_str import CLEAN_STR_MAPPINGS_HUGE
    >>> from nvm.aux_str import CLEAN_STR_MAPPINGS_SPACE
    >>> from nvm.aux_str import CLEAN_STR_MAPPINGS_DROP_HASHTAGS
    >>> # Display sample mapping as JSON:
    >>> import srsly
    >>> print(srsly.json_dumps(CLEAN_STR_MAPPINGS_TINY, indent=2))
    [
      {
        " ":[
          "\\n",
          "\\r",
          "\\t"
        ]
      },
      {
        "-":[
          "\\u2212",
          "\\u2013",
          "\\u2014",
          "\\u2015",
          "\\ufe63",
          "\\uff0d"
        ]
      }
    ]


    Note that we used |json_dumps|_ function from the |srsly|_ library
    to get indented JSON output.

    Drop hashtags

    >>> from nvm.aux_str import CLEAN_STR_MAPPINGS_DROP_HASHTAGS as map0
    >>> from nvm.aux_str import clean_str
    >>> text_dirty = "  #one\\ntwo\\n\\tthree #3443 #three434 #44ok \\t #four... five #hashTag comose text"
    >>> text_clean = clean_str(text=text_dirty, mappings=map0)
    >>> # print(text_dirty)
    >>> print(text_clean)
    two three #3443 ... five comose text


    .. |srsly| replace:: ``srsly``
    .. _srsly: https://github.com/explosion/srsly

    .. |json_dumps| replace:: ``json_dumps``
    .. _json_dumps: https://github.com/explosion/srsly/blob/136eb677604e65fd4f00ce9594c6f558b1fc2d3c/srsly/_json_api.py#L10  ## noqa: E501

    """
    # Make sure that the text input is str; non-str values are converted.
    text = str(text)

    # Substitute each undesired pattern with the desired replacement. The
    # mappings are applied strictly in list order (and, within a dictionary,
    # in key insertion order), so later substitutions see the output of
    # earlier ones.
    for mapping in mappings:
        for replacement, patterns in mapping.items():
            for pattern in patterns:
                text = re.sub(pattern, replacement, text)

    # Collapse every run of two or more whitespace characters to one space.
    # A single whitespace character that is not a space (e.g. a lone "\n"
    # not handled by the mappings) is left unchanged by this step.
    text = re.sub(r"\s\s+", " ", text)

    # Strip whitespace at the beginning and end of the output.
    return text.strip()


def _temp_test_awkward_mappings():
    """Print each ``pattern key`` pair of a sample mapping, one per line.

    The sample mapping is a single dictionary holding two keys (``"a"`` and
    ``"x"``) rather than one dictionary per key, i.e. the alternative layout
    to ``[{"a": list("ABC")}, {"x": list("XYZ")}]``. The pairs are walked in
    the same nested order as the substitution loop in ``clean_str``.
    """
    sample = [{"a": list("ABC"), "x": list("XYZ")}]
    # Flatten list -> dict items -> patterns into (pattern, key) pairs.
    pairs = (
        (pattern, replacement)
        for group in sample
        for replacement, patterns in group.items()
        for pattern in patterns
    )
    for pattern, replacement in pairs:
        print(pattern, replacement)