toolbox/waifu/modules/kajiwoto_pdm.py

48 lines
1.9 KiB
Python
Raw Normal View History

2022-12-18 01:36:33 +01:00
import typing as t
from waifu.datasets.kajiwoto import (KajiwotoDataset, generate_variants_for,
replace_special_tokens_in)
from waifu.modules import BaseModule
from waifu.utils.strings import uppercase
# Speaker label put in front of every user message in the generated dialogue.
USER_PREFIX = "Person 1"
# Speaker label put in front of every bot response, and also used to title the
# bot's "Description" and "Persona" header lines.
BOT_PREFIX = "Person 2"
class KajiwotoPDM(BaseModule):
    '''A Persona Dialogue Module powered by the Kajiwoto dataset.'''

    def generator(self) -> t.Generator[str, None, None]:
        '''
        Yields dialogue strings built from the episodes of `KajiwotoDataset`.

        For every episode, a text block is assembled consisting of:

        - a `<BOT_PREFIX>'s Description: ...` line,
        - a `<BOT_PREFIX>'s Persona: ...` line,
        - an empty line,
        - one `<USER_PREFIX>: ...` / `<BOT_PREFIX>: ...` line pair per turn.

        The block is passed through `replace_special_tokens_in`, and every
        string produced by `generate_variants_for` for it is yielded in turn.
        '''
        dataset = KajiwotoDataset()
        for episode in dataset:
            # The bot is looked up through the first turn of the episode.
            metadata = dataset.get_metadata_for_bot(episode[0].bot_id)

            # `metadata.personalities` is in a format like:
            # `[["friendly", "20.32"]]`, but we want that "phrased" closer to
            # natural language, so we keep only the descriptor (first item of
            # each pair) and join them into `persona_string`.
            personality_descriptors = [x[0] for x in metadata.personalities]
            persona_string = ". ".join(
                uppercase(x) for x in personality_descriptors) + "."

            # Flatten the description onto a single line: newlines become
            # spaces, then doubled spaces (e.g. a newline that sat next to a
            # space) are squeezed into one. This is a single pass, so runs of
            # three or more spaces are shortened rather than fully collapsed.
            description_string = metadata.description.replace(
                "\n", " ").replace("  ", " ")

            turns: list[str] = [
                f"{BOT_PREFIX}'s Description: {description_string}",
                f"{BOT_PREFIX}'s Persona: {persona_string}",
                # Empty turn to have a line break separating
                # description/persona and the actual messages.
                "",
            ]

            for turn in episode:
                turns.append(f"{USER_PREFIX}: {turn.user_message}")
                turns.append(f"{BOT_PREFIX}: {turn.bot_response}")

            string = "\n".join(turns)
            processed_string = replace_special_tokens_in(string)
            yield from generate_variants_for(processed_string)