feat: changes to log and discard some not-so-great data
This commit is contained in:
parent
9f55ecfca7
commit
e4594338d2
|
@ -12,6 +12,9 @@ from waifu.utils.dataset import get_data_path
|
|||
# The regex used to find message variants (e.g.: `%{Hi|Hello} there!`)
|
||||
KAJIWOTO_VARIANT_REGEX = re.compile(r'%{(.+?)}')
|
||||
|
||||
# These bots shouldn't be a part of the final dataset, for whatever reason.
|
||||
BLACKLISTED_BOT_IDS = set(["WvqA"])
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
@ -262,6 +265,10 @@ def _enumerate_kajiwoto_json_files() -> list[str]:
|
|||
# Don't want to list metadata files here.
|
||||
continue
|
||||
|
||||
if item.replace(".json", "") in BLACKLISTED_BOT_IDS:
|
||||
# Don't want blacklisted bots being included.
|
||||
continue
|
||||
|
||||
item_path = os.path.join(dataset_path, item)
|
||||
if not os.path.isfile(item_path):
|
||||
# Don't care about folders.
|
||||
|
|
|
@ -23,6 +23,8 @@ from waifu.utils.dataset import get_data_path
|
|||
# Matches user mentions, channel links, emotes and maybe other stuff.
|
||||
SPECIAL_TOKENS_REGEX = re.compile(r"<[@:#].+?>")
|
||||
|
||||
MINIMUM_EPISODE_LENGTH = 5
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
@ -45,9 +47,10 @@ class DiscordVDM(BaseModule):
|
|||
turns, last_message_id = episode_contents
|
||||
|
||||
# Discard short episodes.
|
||||
if len(turns) < 8:
|
||||
logger.debug("Found short %s-turn episode, discarding.",
|
||||
len(turns))
|
||||
if len(turns) < MINIMUM_EPISODE_LENGTH:
|
||||
logger.debug(
|
||||
"Found short %s-turn episode (< %s), discarding.",
|
||||
len(turns), MINIMUM_EPISODE_LENGTH)
|
||||
continue
|
||||
|
||||
# Discard conversations with overly short messages.
|
||||
|
@ -194,7 +197,8 @@ def _build_episode_turns(
|
|||
continue
|
||||
|
||||
if _looks_like_ooc(cleaned_text):
|
||||
# Self-explanatory.
|
||||
logger.debug("Dropping what _seems_ to be OOC talk: `%s`",
|
||||
cleaned_text)
|
||||
continue
|
||||
|
||||
# Get username.
|
||||
|
|
|
@ -10,6 +10,7 @@ import sys
|
|||
import typing as t
|
||||
|
||||
from waifu.modules import BaseModule
|
||||
from waifu.utils.strings import contains_suspect_unicode
|
||||
|
||||
# TODO(11b): Needs manual maintenance to keep up-to-date. Consider doing some
|
||||
# metaprogramming trickery to build this list out instead.
|
||||
|
@ -118,6 +119,8 @@ def main() -> None:
|
|||
# file.
|
||||
for module in modules:
|
||||
for episode in module():
|
||||
if contains_suspect_unicode(episode):
|
||||
print(f"Found suspect unicode contents in `{episode}`")
|
||||
json_line = json.dumps({"text": episode})
|
||||
output_file.write(f"{json_line}\n")
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
# Some of this is copied from Meta's ParlAI. See:
|
||||
# https://github.com/facebookresearch/ParlAI/blob/main/parlai/utils/strings.py
|
||||
import regex
|
||||
|
||||
|
||||
def normalize_string(text: str, version: int = 1) -> str:
|
||||
|
@ -54,9 +55,18 @@ def title_case(string: str) -> str:
|
|||
|
||||
def uppercase(string: str) -> str:
    '''
    Makes the first character of the string uppercase, if the string is
    non-empty.

    Only the first character is touched; the remainder of the string is
    returned exactly as given. An empty string is returned unchanged.
    '''
    # Guard clause: indexing `string[0]` on an empty string would raise, so
    # hand empty input straight back.
    if not string:
        return string

    return string[0].upper() + string[1:]
|
||||
|
||||
|
||||
def contains_suspect_unicode(string: str) -> bool:
    '''
    Tells whether the given string looks like it carries Unicode trickery
    (e.g.: Zalgo text).

    The heuristic searches for a run of three or more consecutive characters
    with the Unicode "Mark" property (combining marks), anywhere in the
    string. Returns True when such a run exists, False otherwise.
    '''
    # `regex` (not the stdlib `re`) is needed here for Unicode property
    # classes in the pattern.
    suspect_run = regex.search(r"\pM{3,}", string)
    return suspect_run is not None
|
||||
|
|
Loading…
Reference in New Issue