feat: changes to log and discard some not-so-great data
This commit is contained in:
parent
9f55ecfca7
commit
e4594338d2
|
@ -12,6 +12,9 @@ from waifu.utils.dataset import get_data_path
|
||||||
# The regex used to find message variants (e.g.: `%{Hi|Hello} there!`)
|
# The regex used to find message variants (e.g.: `%{Hi|Hello} there!`)
|
||||||
KAJIWOTO_VARIANT_REGEX = re.compile(r'%{(.+?)}')
|
KAJIWOTO_VARIANT_REGEX = re.compile(r'%{(.+?)}')
|
||||||
|
|
||||||
|
# These bots shouldn't be a part of the final dataset, for whatever reason.
|
||||||
|
BLACKLISTED_BOT_IDS = set(["WvqA"])
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@ -262,6 +265,10 @@ def _enumerate_kajiwoto_json_files() -> list[str]:
|
||||||
# Don't want to list metadata files here.
|
# Don't want to list metadata files here.
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
if item.replace(".json", "") in BLACKLISTED_BOT_IDS:
|
||||||
|
# Don't want blacklisted bots being included.
|
||||||
|
continue
|
||||||
|
|
||||||
item_path = os.path.join(dataset_path, item)
|
item_path = os.path.join(dataset_path, item)
|
||||||
if not os.path.isfile(item_path):
|
if not os.path.isfile(item_path):
|
||||||
# Don't care about folders.
|
# Don't care about folders.
|
||||||
|
|
|
@ -23,6 +23,8 @@ from waifu.utils.dataset import get_data_path
|
||||||
# Matches user mentions, channel links, emotes and maybe other stuff.
|
# Matches user mentions, channel links, emotes and maybe other stuff.
|
||||||
SPECIAL_TOKENS_REGEX = re.compile(r"<[@:#].+?>")
|
SPECIAL_TOKENS_REGEX = re.compile(r"<[@:#].+?>")
|
||||||
|
|
||||||
|
MINIMUM_EPISODE_LENGTH = 5
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@ -45,9 +47,10 @@ class DiscordVDM(BaseModule):
|
||||||
turns, last_message_id = episode_contents
|
turns, last_message_id = episode_contents
|
||||||
|
|
||||||
# Discard short episodes.
|
# Discard short episodes.
|
||||||
if len(turns) < 8:
|
if len(turns) < MINIMUM_EPISODE_LENGTH:
|
||||||
logger.debug("Found short %s-turn episode, discarding.",
|
logger.debug(
|
||||||
len(turns))
|
"Found short %s-turn episode (< %s), discarding.",
|
||||||
|
len(turns), MINIMUM_EPISODE_LENGTH)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Discard conversations with overly short messages.
|
# Discard conversations with overly short messages.
|
||||||
|
@ -194,7 +197,8 @@ def _build_episode_turns(
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if _looks_like_ooc(cleaned_text):
|
if _looks_like_ooc(cleaned_text):
|
||||||
# Self-explanatory.
|
logger.debug("Dropping what _seems_ to be OOC talk: `%s`",
|
||||||
|
cleaned_text)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Get username.
|
# Get username.
|
||||||
|
|
|
@ -10,6 +10,7 @@ import sys
|
||||||
import typing as t
|
import typing as t
|
||||||
|
|
||||||
from waifu.modules import BaseModule
|
from waifu.modules import BaseModule
|
||||||
|
from waifu.utils.strings import contains_suspect_unicode
|
||||||
|
|
||||||
# TODO(11b): Needs manual maintenance to keep up-to-date. Consider doing some
|
# TODO(11b): Needs manual maintenance to keep up-to-date. Consider doing some
|
||||||
# metaprogramming trickery to build this list out instead.
|
# metaprogramming trickery to build this list out instead.
|
||||||
|
@ -118,6 +119,8 @@ def main() -> None:
|
||||||
# file.
|
# file.
|
||||||
for module in modules:
|
for module in modules:
|
||||||
for episode in module():
|
for episode in module():
|
||||||
|
if contains_suspect_unicode(episode):
|
||||||
|
print(f"Found suspect unicode contents in `{episode}`")
|
||||||
json_line = json.dumps({"text": episode})
|
json_line = json.dumps({"text": episode})
|
||||||
output_file.write(f"{json_line}\n")
|
output_file.write(f"{json_line}\n")
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
# Some of this is pasta from Meta's ParlAI. See:
|
# Some of this is pasta from Meta's ParlAI. See:
|
||||||
# https://github.com/facebookresearch/ParlAI/blob/main/parlai/utils/strings.py
|
# https://github.com/facebookresearch/ParlAI/blob/main/parlai/utils/strings.py
|
||||||
|
import regex
|
||||||
|
|
||||||
|
|
||||||
def normalize_string(text: str, version: int = 1) -> str:
|
def normalize_string(text: str, version: int = 1) -> str:
|
||||||
|
@ -54,9 +55,18 @@ def title_case(string: str) -> str:
|
||||||
|
|
||||||
def uppercase(string: str) -> str:
|
def uppercase(string: str) -> str:
|
||||||
'''
|
'''
|
||||||
Make the first character of the string uppercase, if the string is non-empty.
|
Makes the first character of the string uppercase, if the string is
|
||||||
|
non-empty.
|
||||||
'''
|
'''
|
||||||
if len(string) == 0:
|
if len(string) == 0:
|
||||||
return string
|
return string
|
||||||
else:
|
else:
|
||||||
return string[0].upper() + string[1:]
|
return string[0].upper() + string[1:]
|
||||||
|
|
||||||
|
|
||||||
|
def contains_suspect_unicode(string: str) -> bool:
|
||||||
|
'''
|
||||||
|
Returns whether the given string seems to have suspect Unicode trickery
|
||||||
|
(e.g.: Zalgo text).
|
||||||
|
'''
|
||||||
|
return regex.search(r"\pM{3,}", string) is not None
|
||||||
|
|
Loading…
Reference in New Issue