feat: changes to log and discard some not-so-great data

2023-01-01 11:34:31 -03:00 · 2023-01-01 11:34:31 -03:00 · e4594338d2
parent 9f55ecfca7
commit e4594338d2
4 changed files with 29 additions and 5 deletions
--- a/waifu/datasets/kajiwoto.py
+++ b/waifu/datasets/kajiwoto.py
@ -12,6 +12,9 @@ from waifu.utils.dataset import get_data_path
 # The regex used to find message variants (e.g.: `%{Hi|Hello} there!`)
 KAJIWOTO_VARIANT_REGEX = re.compile(r'%{(.+?)}')
 # These bots shouldn't be a part of the final dataset, for whatever reason.
 BLACKLISTED_BOT_IDS = set(["WvqA"])
 logger = logging.getLogger(__name__)
@ -262,6 +265,10 @@ def _enumerate_kajiwoto_json_files() -> list[str]:
            # Don't want to list metadata files here.
            continue
        if item.replace(".json", "") in BLACKLISTED_BOT_IDS:
            # Don't want blacklisted bots being included.
            continue
        item_path = os.path.join(dataset_path, item)
        if not os.path.isfile(item_path):
            # Don't care about folders.
--- a/waifu/modules/discord_vdm.py
+++ b/waifu/modules/discord_vdm.py
@ -23,6 +23,8 @@ from waifu.utils.dataset import get_data_path
 # Matches user mentions, channel links, emotes and maybe other stuff.
 SPECIAL_TOKENS_REGEX = re.compile(r"<[@:#].+?>")
 MINIMUM_EPISODE_LENGTH = 5
 logger = logging.getLogger(__name__)
@ -45,9 +47,10 @@ class DiscordVDM(BaseModule):
                turns, last_message_id = episode_contents
                # Discard short episodes.
-                if len(turns) < 8:
+                if len(turns) < MINIMUM_EPISODE_LENGTH:
-                    logger.debug("Found short %s-turn episode, discarding.",
+                    logger.debug(
-                                 len(turns))
+                        "Found short %s-turn episode (< %s), discarding.",
                        len(turns), MINIMUM_EPISODE_LENGTH)
                    continue
                # Discard conversations with overly short messages.
@ -194,7 +197,8 @@ def _build_episode_turns(
            continue
        if _looks_like_ooc(cleaned_text):
-            # Self-explanatory.
+            logger.debug("Dropping what _seems_ to be OOC talk: `%s`",
                         cleaned_text)
            continue
        # Get username.
--- a/waifu/scripts/build_dataset.py
+++ b/waifu/scripts/build_dataset.py
@ -10,6 +10,7 @@ import sys
 import typing as t
 from waifu.modules import BaseModule
 from waifu.utils.strings import contains_suspect_unicode
 # TODO(11b): Needs manual maintenance to keep up-to-date. Consider doing some
 # metaprogramming trickery to build this list out instead.
@ -118,6 +119,8 @@ def main() -> None:
        # file.
        for module in modules:
            for episode in module():
                if contains_suspect_unicode(episode):
                    print(f"Found suspect unicode contents in `{episode}`")
                json_line = json.dumps({"text": episode})
                output_file.write(f"{json_line}\n")
--- a/waifu/utils/strings.py
+++ b/waifu/utils/strings.py
@ -2,6 +2,7 @@
 # Some of this is pasta from Meta's ParlAI. See:
 # https://github.com/facebookresearch/ParlAI/blob/main/parlai/utils/strings.py
 import regex
 def normalize_string(text: str, version: int = 1) -> str:
@ -54,9 +55,18 @@ def title_case(string: str) -> str:
 def uppercase(string: str) -> str:
    '''
-    Make the first character of the string uppercase, if the string is non-empty.
+    Makes the first character of the string uppercase, if the string is
    non-empty.
    '''
    if len(string) == 0:
        return string
    else:
        return string[0].upper() + string[1:]
 def contains_suspect_unicode(string: str) -> bool:
    '''
    Checks the given string for signs of Unicode trickery (e.g.: Zalgo text).

    The heuristic is a run of three or more consecutive characters with the
    Unicode "Mark" property anywhere in the string. Returns True when such a
    run is found, and False otherwise.
    '''
    stacked_marks = regex.search(r"\pM{3,}", string)
    if stacked_marks is None:
        return False
    return True