feat: changes to log and discard some not-so-great data

11b 2023-01-01 11:34:31 -03:00
parent 9f55ecfca7
commit e4594338d2
4 changed files with 29 additions and 5 deletions

View File

@@ -12,6 +12,9 @@ from waifu.utils.dataset import get_data_path
 # The regex used to find message variants (e.g.: `%{Hi|Hello} there!`)
 KAJIWOTO_VARIANT_REGEX = re.compile(r'%{(.+?)}')
+# These bots shouldn't be a part of the final dataset, for whatever reason.
+BLACKLISTED_BOT_IDS = set(["WvqA"])
 logger = logging.getLogger(__name__)
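
For context: `KAJIWOTO_VARIANT_REGEX` captures the `Hi|Hello` body of each `%{Hi|Hello}` template. A minimal sketch of how such a template could be rendered into a concrete message (the `render_variant` helper is illustrative, not part of this diff):

import random
import re

KAJIWOTO_VARIANT_REGEX = re.compile(r'%{(.+?)}')

def render_variant(message: str) -> str:
    # Replace each %{a|b|c} group with one randomly chosen alternative.
    return KAJIWOTO_VARIANT_REGEX.sub(
        lambda match: random.choice(match.group(1).split("|")), message)

print(render_variant("%{Hi|Hello} there!"))  # e.g. "Hello there!"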
@@ -262,6 +265,10 @@ def _enumerate_kajiwoto_json_files() -> list[str]:
             # Don't want to list metadata files here.
             continue
+        if item.replace(".json", "") in BLACKLISTED_BOT_IDS:
+            # Don't want blacklisted bots being included.
+            continue
         item_path = os.path.join(dataset_path, item)
         if not os.path.isfile(item_path):
             # Don't care about folders.

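Condensed, the new filter in `_enumerate_kajiwoto_json_files` amounts to the check sketched below (`_keep` is a hypothetical name; the real code inlines this in the enumeration loop). Note that `str.replace` strips `".json"` anywhere in the name, not only as a suffix; `os.path.splitext` would be the stricter way to recover the bot ID.

import os

BLACKLISTED_BOT_IDS = set(["WvqA"])

def _keep(item: str, dataset_path: str) -> bool:
    # Recover the bot ID from the file name and drop blacklisted bots,
    # then ignore anything that isn't a regular file.
    if item.replace(".json", "") in BLACKLISTED_BOT_IDS:
        return False
    return os.path.isfile(os.path.join(dataset_path, item))

print(_keep("WvqA.json", "."))  # False: blacklisted bot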
View File

@@ -23,6 +23,8 @@ from waifu.utils.dataset import get_data_path
 # Matches user mentions, channel links, emotes and maybe other stuff.
 SPECIAL_TOKENS_REGEX = re.compile(r"<[@:#].+?>")
+MINIMUM_EPISODE_LENGTH = 5
 logger = logging.getLogger(__name__)
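
As a quick illustration of what `SPECIAL_TOKENS_REGEX` catches (the sample message below is made up; `<@id>`, `<#id>` and `<:name:id>` are Discord's wire formats for user mentions, channel links and custom emotes):

import re

SPECIAL_TOKENS_REGEX = re.compile(r"<[@:#].+?>")

text = "Hey <@123456789>, check <#987654321> <:pogchamp:112233>"
print(SPECIAL_TOKENS_REGEX.sub("", text))  # mentions, link and emote stripped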
@@ -45,9 +47,10 @@ class DiscordVDM(BaseModule):
             turns, last_message_id = episode_contents
             # Discard short episodes.
-            if len(turns) < 8:
-                logger.debug("Found short %s-turn episode, discarding.",
-                             len(turns))
+            if len(turns) < MINIMUM_EPISODE_LENGTH:
+                logger.debug(
+                    "Found short %s-turn episode (< %s), discarding.",
+                    len(turns), MINIMUM_EPISODE_LENGTH)
                 continue
             # Discard conversations with overly short messages.
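
Run in isolation with DEBUG logging enabled, the new discard logic behaves like this sketch (the two toy episodes are stand-ins for real turn lists):

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
MINIMUM_EPISODE_LENGTH = 5

episodes = [["hi", "hey"], ["turn"] * 7]
kept = []
for turns in episodes:
    if len(turns) < MINIMUM_EPISODE_LENGTH:
        logger.debug("Found short %s-turn episode (< %s), discarding.",
                     len(turns), MINIMUM_EPISODE_LENGTH)
        continue
    kept.append(turns)
print(len(kept))  # 1: only the 7-turn episode survives

Passing `len(turns)` as a lazy `%s` argument instead of interpolating it into an f-string means the message is only formatted when DEBUG is actually enabled, which is presumably why the diff keeps the %-style arguments.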
@@ -194,7 +197,8 @@ def _build_episode_turns(
                 continue
             if _looks_like_ooc(cleaned_text):
-                # Self-explanatory.
+                logger.debug("Dropping what _seems_ to be OOC talk: `%s`",
+                             cleaned_text)
                 continue
             # Get username.
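
`_looks_like_ooc` is not shown in this diff, so the heuristic below is purely a hypothetical stand-in: roleplay communities conventionally wrap out-of-character chatter in parentheses or prefix it with "ooc:", and a regex over that convention is one plausible shape for the check.

import re

OOC_REGEX = re.compile(r"^\(+.*\)+$|^ooc[:;]", re.IGNORECASE)

def _looks_like_ooc(text: str) -> bool:
    # Hypothetical: flag fully-parenthesized messages or "ooc:" prefixes.
    return OOC_REGEX.search(text.strip()) is not None

print(_looks_like_ooc("((brb, dinner))"))             # True
print(_looks_like_ooc("The knight drew his sword."))  # False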

View File

@@ -10,6 +10,7 @@ import sys
 import typing as t
 from waifu.modules import BaseModule
+from waifu.utils.strings import contains_suspect_unicode
 # TODO(11b): Needs manual maintenance to keep up-to-date. Consider doing some
 # metaprogramming trickery to build this list out instead.
@@ -118,6 +119,8 @@ def main() -> None:
     # file.
     for module in modules:
         for episode in module():
+            if contains_suspect_unicode(episode):
+                print(f"Found suspect unicode contents in `{episode}`")
             json_line = json.dumps({"text": episode})
             output_file.write(f"{json_line}\n")
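
Note that the new check only reports suspect episodes; they are still written out. The exporter emits one JSON object per line (JSONL), so reading the result back is straightforward (the `output.jsonl` path here is illustrative, not from the diff):

import json

with open("output.jsonl", encoding="utf-8") as f:
    episodes = [json.loads(line)["text"] for line in f]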

View File

@@ -2,6 +2,7 @@
 # Some of this is pasta from Meta's ParlAI. See:
 # https://github.com/facebookresearch/ParlAI/blob/main/parlai/utils/strings.py
+import regex
 def normalize_string(text: str, version: int = 1) -> str:
@@ -54,9 +55,18 @@ def title_case(string: str) -> str:
 def uppercase(string: str) -> str:
     '''
-    Make the first character of the string uppercase, if the string is non-empty.
+    Makes the first character of the string uppercase, if the string is
+    non-empty.
     '''
     if len(string) == 0:
         return string
     else:
         return string[0].upper() + string[1:]
+
+def contains_suspect_unicode(string: str) -> bool:
+    '''
+    Returns whether the given string seems to have suspect Unicode trickery
+    (e.g.: Zalgo text).
+    '''
+    return regex.search(r"\pM{3,}", string) is not None
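
`regex` here is the third-party PyPI package, not the stdlib `re`, which lacks `\p` Unicode property escapes. `\pM` matches combining marks, and three or more in a row is a strong Zalgo signal, while ordinary accented text stays below the threshold:

import regex

def contains_suspect_unicode(string: str) -> bool:
    return regex.search(r"\pM{3,}", string) is not None

zalgo = "h\u0351\u0352\u0353ello"  # 'h' buried under three combining marks
print(contains_suspect_unicode(zalgo))   # True
print(contains_suspect_unicode("café"))  # False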