feat: changes to log and discard some not-so-great data

11b 2023-01-01 11:34:31 -03:00
parent 9f55ecfca7
commit e4594338d2
4 changed files with 29 additions and 5 deletions

View File

@@ -12,6 +12,9 @@ from waifu.utils.dataset import get_data_path
 # The regex used to find message variants (e.g.: `%{Hi|Hello} there!`)
 KAJIWOTO_VARIANT_REGEX = re.compile(r'%{(.+?)}')
+# These bots shouldn't be a part of the final dataset, for whatever reason.
+BLACKLISTED_BOT_IDS = set(["WvqA"])
 logger = logging.getLogger(__name__)
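
For context: `KAJIWOTO_VARIANT_REGEX` captures the `Hi|Hello` body of each `%{Hi|Hello}` template. A minimal sketch of how such a template could be rendered into a concrete message (the `render_variant` helper is illustrative, not part of this diff):

import random
import re

KAJIWOTO_VARIANT_REGEX = re.compile(r'%{(.+?)}')

def render_variant(message: str) -> str:
    # Replace each %{a|b|c} group with one randomly chosen alternative.
    return KAJIWOTO_VARIANT_REGEX.sub(
        lambda match: random.choice(match.group(1).split("|")), message)

print(render_variant("%{Hi|Hello} there!"))  # e.g. "Hello there!"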
@@ -262,6 +265,10 @@ def _enumerate_kajiwoto_json_files() -> list[str]:
             # Don't want to list metadata files here.
             continue
+        if item.replace(".json", "") in BLACKLISTED_BOT_IDS:
+            # Don't want blacklisted bots being included.
+            continue
         item_path = os.path.join(dataset_path, item)
         if not os.path.isfile(item_path):
             # Don't care about folders.

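Condensed, the new filter in `_enumerate_kajiwoto_json_files` amounts to the check sketched below (`_keep` is a hypothetical name; the real code inlines this in the enumeration loop). Note that `str.replace` strips `".json"` anywhere in the name, not only as a suffix; `os.path.splitext` would be the stricter way to recover the bot ID.

import os

BLACKLISTED_BOT_IDS = set(["WvqA"])

def _keep(item: str, dataset_path: str) -> bool:
    # Recover the bot ID from the file name and drop blacklisted bots,
    # then ignore anything that isn't a regular file.
    if item.replace(".json", "") in BLACKLISTED_BOT_IDS:
        return False
    return os.path.isfile(os.path.join(dataset_path, item))

print(_keep("WvqA.json", "."))  # False: blacklisted bot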
View File

@@ -23,6 +23,8 @@ from waifu.utils.dataset import get_data_path
 # Matches user mentions, channel links, emotes and maybe other stuff.
 SPECIAL_TOKENS_REGEX = re.compile(r"<[@:#].+?>")
+MINIMUM_EPISODE_LENGTH = 5
 logger = logging.getLogger(__name__)
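
As a quick illustration of what `SPECIAL_TOKENS_REGEX` catches (the sample message below is made up; `<@id>`, `<#id>` and `<:name:id>` are Discord's wire formats for user mentions, channel links and custom emotes):

import re

SPECIAL_TOKENS_REGEX = re.compile(r"<[@:#].+?>")

text = "Hey <@123456789>, check <#987654321> <:pogchamp:112233>"
print(SPECIAL_TOKENS_REGEX.sub("", text))  # mentions, link and emote stripped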
@@ -45,9 +47,10 @@ class DiscordVDM(BaseModule):
             turns, last_message_id = episode_contents
             # Discard short episodes.
-            if len(turns) < 8:
-                logger.debug("Found short %s-turn episode, discarding.",
-                             len(turns))
+            if len(turns) < MINIMUM_EPISODE_LENGTH:
+                logger.debug(
+                    "Found short %s-turn episode (< %s), discarding.",
+                    len(turns), MINIMUM_EPISODE_LENGTH)
                 continue
             # Discard conversations with overly short messages.
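
Run in isolation with DEBUG logging enabled, the new discard logic behaves like this sketch (the two toy episodes are stand-ins for real turn lists):

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
MINIMUM_EPISODE_LENGTH = 5

episodes = [["hi", "hey"], ["turn"] * 7]
kept = []
for turns in episodes:
    if len(turns) < MINIMUM_EPISODE_LENGTH:
        logger.debug("Found short %s-turn episode (< %s), discarding.",
                     len(turns), MINIMUM_EPISODE_LENGTH)
        continue
    kept.append(turns)
print(len(kept))  # 1: only the 7-turn episode survives

Passing `len(turns)` as a lazy `%s` argument instead of interpolating it into an f-string means the message is only formatted when DEBUG is actually enabled, which is presumably why the diff keeps the %-style arguments.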
@@ -194,7 +197,8 @@ def _build_episode_turns(
                 continue
             if _looks_like_ooc(cleaned_text):
-                # Self-explanatory.
+                logger.debug("Dropping what _seems_ to be OOC talk: `%s`",
+                             cleaned_text)
                 continue
             # Get username.
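
`_looks_like_ooc` is not shown in this diff, so the heuristic below is purely a hypothetical stand-in: roleplay communities conventionally wrap out-of-character chatter in parentheses or prefix it with "ooc:", and a regex over that convention is one plausible shape for the check.

import re

OOC_REGEX = re.compile(r"^\(+.*\)+$|^ooc[:;]", re.IGNORECASE)

def _looks_like_ooc(text: str) -> bool:
    # Hypothetical: flag fully-parenthesized messages or "ooc:" prefixes.
    return OOC_REGEX.search(text.strip()) is not None

print(_looks_like_ooc("((brb, dinner))"))             # True
print(_looks_like_ooc("The knight drew his sword."))  # False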

View File

@@ -10,6 +10,7 @@ import sys
 import typing as t
 from waifu.modules import BaseModule
+from waifu.utils.strings import contains_suspect_unicode
 # TODO(11b): Needs manual maintenance to keep up-to-date. Consider doing some
 # metaprogramming trickery to build this list out instead.
@@ -118,6 +119,8 @@ def main() -> None:
     # file.
     for module in modules:
         for episode in module():
+            if contains_suspect_unicode(episode):
+                print(f"Found suspect unicode contents in `{episode}`")
             json_line = json.dumps({"text": episode})
             output_file.write(f"{json_line}\n")
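
Note that the new check only reports suspect episodes; they are still written out. The exporter emits one JSON object per line (JSONL), so reading the result back is straightforward (the `output.jsonl` path here is illustrative, not from the diff):

import json

with open("output.jsonl", encoding="utf-8") as f:
    episodes = [json.loads(line)["text"] for line in f]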

View File

@@ -2,6 +2,7 @@
 # Some of this is pasta from Meta's ParlAI. See:
 # https://github.com/facebookresearch/ParlAI/blob/main/parlai/utils/strings.py
+import regex
 def normalize_string(text: str, version: int = 1) -> str:
@@ -54,9 +55,18 @@ def title_case(string: str) -> str:
 def uppercase(string: str) -> str:
     '''
-    Make the first character of the string uppercase, if the string is non-empty.
+    Makes the first character of the string uppercase, if the string is
+    non-empty.
     '''
     if len(string) == 0:
         return string
     else:
         return string[0].upper() + string[1:]
+
+def contains_suspect_unicode(string: str) -> bool:
+    '''
+    Returns whether the given string seems to have suspect Unicode trickery
+    (e.g.: Zalgo text).
+    '''
+    return regex.search(r"\pM{3,}", string) is not None
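
`regex` here is the third-party PyPI package, not the stdlib `re`, which lacks `\p` Unicode property escapes. `\pM` matches combining marks, and three or more in a row is a strong Zalgo signal, while ordinary accented text stays below the threshold:

import regex

def contains_suspect_unicode(string: str) -> bool:
    return regex.search(r"\pM{3,}", string) is not None

zalgo = "h\u0351\u0352\u0353ello"  # 'h' buried under three combining marks
print(contains_suspect_unicode(zalgo))   # True
print(contains_suspect_unicode("café"))  # False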