diff --git a/waifu/datasets/kajiwoto.py b/waifu/datasets/kajiwoto.py
index 78fb0c1..039573e 100644
--- a/waifu/datasets/kajiwoto.py
+++ b/waifu/datasets/kajiwoto.py
@@ -12,6 +12,9 @@ from waifu.utils.dataset import get_data_path
 # The regex used to find message variants (e.g.: `%{Hi|Hello} there!`)
 KAJIWOTO_VARIANT_REGEX = re.compile(r'%{(.+?)}')
 
+# These bots shouldn't be a part of the final dataset, for whatever reason.
+BLACKLISTED_BOT_IDS = set(["WvqA"])
+
 logger = logging.getLogger(__name__)
 
 
@@ -262,6 +265,10 @@ def _enumerate_kajiwoto_json_files() -> list[str]:
             # Don't want to list metadata files here.
            continue
 
+        if item.replace(".json", "") in BLACKLISTED_BOT_IDS:
+            # Don't want blacklisted bots being included.
+            continue
+
         item_path = os.path.join(dataset_path, item)
         if not os.path.isfile(item_path):
             # Don't care about folders.
diff --git a/waifu/modules/discord_vdm.py b/waifu/modules/discord_vdm.py
index 264dfce..3b31051 100644
--- a/waifu/modules/discord_vdm.py
+++ b/waifu/modules/discord_vdm.py
@@ -23,6 +23,8 @@ from waifu.utils.dataset import get_data_path
 # Matches user mentions, channel links, emotes and maybe other stuff.
 SPECIAL_TOKENS_REGEX = re.compile(r"<[@:#].+?>")
 
+MINIMUM_EPISODE_LENGTH = 5
+
 logger = logging.getLogger(__name__)
 
 
@@ -45,9 +47,10 @@ class DiscordVDM(BaseModule):
             turns, last_message_id = episode_contents
 
             # Discard short episodes.
-            if len(turns) < 8:
-                logger.debug("Found short %s-turn episode, discarding.",
-                             len(turns))
+            if len(turns) < MINIMUM_EPISODE_LENGTH:
+                logger.debug(
+                    "Found short %s-turn episode (< %s), discarding.",
+                    len(turns), MINIMUM_EPISODE_LENGTH)
                 continue
 
             # Discard conversations with overly short messages.
@@ -194,7 +197,8 @@ def _build_episode_turns(
             continue
 
         if _looks_like_ooc(cleaned_text):
-            # Self-explanatory.
+            logger.debug("Dropping what _seems_ to be OOC talk: `%s`",
+                         cleaned_text)
             continue
 
         # Get username.
diff --git a/waifu/scripts/build_dataset.py b/waifu/scripts/build_dataset.py
index e6b4e97..cf9a22a 100755
--- a/waifu/scripts/build_dataset.py
+++ b/waifu/scripts/build_dataset.py
@@ -10,6 +10,7 @@ import sys
 import typing as t
 
 from waifu.modules import BaseModule
+from waifu.utils.strings import contains_suspect_unicode
 
 # TODO(11b): Needs manual maintenance to keep up-to-date. Consider doing some
 # metaprogramming trickery to build this list out instead.
@@ -118,6 +119,8 @@ def main() -> None:
     # file.
     for module in modules:
         for episode in module():
+            if contains_suspect_unicode(episode):
+                print(f"Found suspect unicode contents in `{episode}`")
             json_line = json.dumps({"text": episode})
             output_file.write(f"{json_line}\n")
 
diff --git a/waifu/utils/strings.py b/waifu/utils/strings.py
index eb92fca..0674cc5 100644
--- a/waifu/utils/strings.py
+++ b/waifu/utils/strings.py
@@ -2,6 +2,7 @@
 
 # Some of this is pasta from Meta's ParlAI. See:
 # https://github.com/facebookresearch/ParlAI/blob/main/parlai/utils/strings.py
+import regex
 
 
 def normalize_string(text: str, version: int = 1) -> str:
@@ -54,9 +55,18 @@ def title_case(string: str) -> str:
 
 def uppercase(string: str) -> str:
     '''
-    Make the first character of the string uppercase, if the string is non-empty.
+    Makes the first character of the string uppercase, if the string is
+    non-empty.
     '''
     if len(string) == 0:
         return string
     else:
         return string[0].upper() + string[1:]
+
+
+def contains_suspect_unicode(string: str) -> bool:
+    '''
+    Returns whether the given string seems to have suspect Unicode trickery
+    (e.g.: Zalgo text).
+    '''
+    return regex.search(r"\pM{3,}", string) is not None
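
Reviewer note: a quick, self-contained sanity check of the new `contains_suspect_unicode()` helper. This is a sketch, not part of the patch; the Zalgo sample string is made up for illustration, and it assumes the third-party `regex` package is installed (stdlib `re` doesn't support the `\pM` Unicode property):

```python
import regex

def contains_suspect_unicode(string: str) -> bool:
    # Three or more consecutive combining marks (\pM) is the heuristic the
    # patch uses to flag Zalgo-style text.
    return regex.search(r"\pM{3,}", string) is not None

# "Z" followed by three stacked combining marks gets flagged...
assert contains_suspect_unicode("Z\u0335\u0327\u0336algo")
# ...while ordinary accented text (at most one mark per character) does not.
assert not contains_suspect_unicode("café au lait")
assert not contains_suspect_unicode("plain ASCII text")
```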
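
For context on `KAJIWOTO_VARIANT_REGEX` (touched in the kajiwoto.py hunk above): Kajiwoto messages can embed variant groups like `%{Hi|Hello} there!`. Below is a minimal sketch of how those groups could be expanded into concrete messages; the `expand_variants` helper is hypothetical, written here only to illustrate the regex, and is not part of this patch or the existing module:

```python
import itertools
import re

# Same pattern as in waifu/datasets/kajiwoto.py.
KAJIWOTO_VARIANT_REGEX = re.compile(r'%{(.+?)}')

def expand_variants(message: str) -> list[str]:
    # Hypothetical helper: enumerate every combination of variant groups.
    groups = KAJIWOTO_VARIANT_REGEX.findall(message)
    if not groups:
        return [message]
    expanded = []
    for combo in itertools.product(*(group.split("|") for group in groups)):
        text = message
        for choice in combo:
            # Replace the leftmost remaining %{...} with this combo's pick.
            text = KAJIWOTO_VARIANT_REGEX.sub(choice, text, count=1)
        expanded.append(text)
    return expanded

print(expand_variants("%{Hi|Hello} there!"))
# ['Hi there!', 'Hello there!']
```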