From 47bafe4b9f8c7e8cde9c0783b349c98c045794a4 Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Fri, 26 Oct 2012 19:03:56 -0700 Subject: [PATCH] Add a lexer for the body of dialogue lines to libaegisub --- .../libaegisub_vs2008.vcproj | 8 + aegisub/libaegisub/common/parser.cpp | 87 +++++- .../include/libaegisub/ass/dialogue_parser.h | 47 ++++ aegisub/tests/Makefile | 1 + aegisub/tests/libaegisub_dialogue_lexer.cpp | 251 ++++++++++++++++++ 5 files changed, 393 insertions(+), 1 deletion(-) create mode 100644 aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h create mode 100644 aegisub/tests/libaegisub_dialogue_lexer.cpp diff --git a/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj b/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj index 08b5568c1..3c33d931b 100644 --- a/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj +++ b/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj @@ -542,6 +542,14 @@ > + + + + diff --git a/aegisub/libaegisub/common/parser.cpp b/aegisub/libaegisub/common/parser.cpp index 9d9359cf0..bd383a26c 100644 --- a/aegisub/libaegisub/common/parser.cpp +++ b/aegisub/libaegisub/common/parser.cpp @@ -17,12 +17,15 @@ #include "parser.h" #include "libaegisub/color.h" +#include "libaegisub/ass/dialogue_parser.h" #include #include #include #include #include +#include +#include BOOST_FUSION_ADAPT_STRUCT( agi::Color, @@ -96,13 +99,95 @@ struct color_grammar : qi::grammar { } }; +template +struct dialogue_tokens : lex::lexer { /* Lexer rules for the body of a dialogue line. Rules are registered in the default state and in the "OVR", "ARG", "TAGSTART" and "TAGNAME" states; the semantic actions assign _state to move between them. */ + int paren_depth; /* Incremented for each '(' and decremented for each ')' matched in the ARG and TAGNAME states; reset to 0 whenever the default state matches '{'. */ + + dialogue_tokens() : paren_depth(0) { + using lex::_state; + using lex::char_; + using lex::string; + using namespace boost::phoenix; + using namespace agi::ass::DialogueTokenType; + + this->self /* Default state: text outside of {...}. */ + = string("\\\\[nNh]", LINE_BREAK) /* a backslash followed by n, N or h */ + | char_('{', OVR_BEGIN)[ref(paren_depth) = 0, _state = "OVR"] + | string(".", TEXT) /* single-character fallback */ + ; + + this->self("OVR") /* After '{': a second '{' is tagged ERROR, '}' returns to INITIAL, a backslash switches to TAGSTART, and any other non-whitespace character is tagged COMMENT. */ + = char_('{', ERROR) + | char_('}', OVR_END)[_state = "INITIAL"] + | char_('\\', TAG_START)[_state = "TAGSTART"] + | string("\\s+", 
WHITESPACE) + | string(".", COMMENT) + ; + + this->self("ARG") /* Arguments of a tag: parentheses, ',' separators and whitespace get their own token types, every other character is ARG. */ + = char_('{', ERROR) + | char_('}', OVR_END)[_state = "INITIAL"] + | char_('(', OPEN_PAREN)[++ref(paren_depth)] + | char_(')', CLOSE_PAREN)[--ref(paren_depth), if_(ref(paren_depth) == 0)[_state = "OVR"]] /* returns to OVR only when the depth reaches exactly 0. NOTE(review): the decrement has no lower bound, so a ')' with no preceding '(' leaves the depth negative - confirm that is intended. */ + | char_('\\', TAG_START)[_state = "TAGSTART"] + | char_(',', ARG_SEP) + | string("\\s+", WHITESPACE) + | string(".", ARG) + ; + + this->self("TAGSTART") /* Directly after a backslash inside {...}: "r" and "fn" switch straight to ARG, any other single [a-z0-9] character is emitted as TAG_NAME and switches to TAGNAME, and a character matching none of the rules is tagged COMMENT and returns to OVR. */ + = string("\\s+", WHITESPACE) + | string("r|fn", TAG_NAME)[_state = "ARG"] + | char_('\\', TAG_START) + | char_('}', OVR_END)[_state = "INITIAL"] + | string("[a-z0-9]", TAG_NAME)[_state = "TAGNAME"] + | string(".", COMMENT)[_state = "OVR"] + ; + + this->self("TAGNAME") /* After the first character of a tag name: a run of [a-z] is tagged TAG_NAME as well; every rule except '}' and backslash then continues in ARG (')' may instead return to OVR). */ + = string("[a-z]+", TAG_NAME)[_state = "ARG"] + | char_('(', OPEN_PAREN)[++ref(paren_depth), _state = "ARG"] + | char_(')', CLOSE_PAREN)[--ref(paren_depth), if_(ref(paren_depth) == 0)[_state = "OVR"]] + | char_('}', OVR_END)[_state = "INITIAL"] + | char_('\\', TAG_START)[_state = "TAGSTART"] + | string(".", ARG)[_state = "ARG"] + ; + } +}; + } -namespace agi { namespace parser { +namespace agi { +namespace parser { bool parse(Color &dst, std::string const& str) { std::string::const_iterator begin = str.begin(); bool parsed = parse(begin, str.end(), color_grammar(), dst); return parsed && begin == str.end(); } } + +namespace ass { + std::vector TokenizeDialogueBody(std::string const& str) { /* Lexes str with dialogue_tokens and returns one DialogueToken per run of consecutive lexer tokens that share an id, with the lengths of the run summed. Iteration stops at the end of input or at the first token for which token_is_valid() is false. */ + dialogue_tokens > tokenizer; + + char const* first = str.c_str(); + char const* last = first + str.size(); + std::vector data; + dialogue_tokens >::iterator_type + it = tokenizer.begin(first, last), + end = tokenizer.end(); + + for (; it != end && token_is_valid(*it); ++it) { + int id = it->id(); + ptrdiff_t len = it->value().end() - it->value().begin(); /* length of the matched text, counted in chars */ + assert(len > 0); + if (data.empty() || data.back().type != id) /* a different id starts a new entry; the same id extends the previous one */ + data.push_back(DialogueToken(id, len)); + else + data.back().length += len; + } + + return data; + } +} } diff --git a/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h 
b/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h new file mode 100644 index 000000000..5c2a10f06 --- /dev/null +++ b/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h @@ -0,0 +1,47 @@ +// Copyright (c) 2012, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#ifndef LAGI_PRE /* NOTE(review): this conditional wraps only the include below; no include guard or #pragma once for the header itself is visible - confirm. */ +#include +#endif + +namespace agi { + namespace ass { + namespace DialogueTokenType { /* Token type ids stored in DialogueToken::type. */ + enum { + TEXT = 1000, /* first id; each following enumerator is one higher */ + LINE_BREAK, + OVR_BEGIN, + OVR_END, + TAG_START, + TAG_NAME, + OPEN_PAREN, + CLOSE_PAREN, + ARG_SEP, + ARG, + ERROR, /* NOTE(review): the Windows SDK header wingdi.h defines an ERROR macro - confirm this name cannot clash on the Visual Studio build. */ + COMMENT, + WHITESPACE + }; + } + + struct DialogueToken { /* One entry of the TokenizeDialogueBody() result: a token type and the length of the input it covers. */ + int type; + size_t length; /* counted in chars of the tokenized std::string */ + DialogueToken(int type, size_t length) : type(type), length(length) { } + }; + + std::vector TokenizeDialogueBody(std::string const& str); /* Splits str into typed tokens; adjacent tokens of the same type are merged. Defined in libaegisub/common/parser.cpp. */ + } +} diff --git a/aegisub/tests/Makefile b/aegisub/tests/Makefile index 383cb5056..7b8adfc7f 100644 --- a/aegisub/tests/Makefile +++ b/aegisub/tests/Makefile @@ -20,6 +20,7 @@ SRC = \ libaegisub_access.cpp \ libaegisub_cajun.cpp \ libaegisub_color.cpp \ + libaegisub_dialogue_lexer.cpp \ libaegisub_hotkey.cpp \ libaegisub_iconv.cpp \ libaegisub_keyframe.cpp \ diff --git a/aegisub/tests/libaegisub_dialogue_lexer.cpp b/aegisub/tests/libaegisub_dialogue_lexer.cpp new file mode 100644 index 000000000..1ced1771d --- /dev/null +++ 
b/aegisub/tests/libaegisub_dialogue_lexer.cpp @@ -0,0 +1,251 @@ +// Copyright (c) 2012, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#include + +#include "main.h" +#include "util.h" + +class lagi_dialogue_lexer : public libagi { /* NOTE(review): the cases below are declared with TEST, not TEST_F, so this fixture class is not used by them in the visible code - confirm whether TEST_F was intended. */ +}; + +using namespace agi::ass; + +TEST(lagi_dialogue_lexer, empty) { + ASSERT_TRUE(TokenizeDialogueBody("").empty()); +} + +#define tok_str(arg1, ...) /* Tokenizes arg1, runs the expect_tok(...) statements passed as the variadic arguments against the result, then checks that every produced token was consumed. */
do { \ + std::string str = arg1; \ + std::vector tok = TokenizeDialogueBody(str); \ + size_t token_index = 0; \ + __VA_ARGS__ \ + EXPECT_EQ(token_index, tok.size()); \ +} while(false) + +#define expect_tok(expected_type, expected_len) /* Checks the type and length of tok[token_index] and advances token_index; when the index is already past the end only the EXPECT_LT failure is recorded. Relies on the tok and token_index locals declared by tok_str. */ do { \ + EXPECT_LT(token_index, tok.size()); \ + if (token_index < tok.size()) { \ + EXPECT_EQ(DialogueTokenType::expected_type, tok[token_index].type); \ + EXPECT_EQ(expected_len, tok[token_index].length); \ + ++token_index; \ + } \ +} while(false) + +TEST(lagi_dialogue_lexer, plain_text) { + tok_str("hello there", + expect_tok(TEXT, 11); + ); + + tok_str("hello\\Nthere", + expect_tok(TEXT, 5); + expect_tok(LINE_BREAK, 2); + expect_tok(TEXT, 5); + ); + + tok_str("hello\\n\\h\\kthere", /* the two adjacent line breaks are merged into one LINE_BREAK of length 4; backslash-k matches no line-break rule, so it is TEXT together with "there" */ + expect_tok(TEXT, 5); + expect_tok(LINE_BREAK, 4); + expect_tok(TEXT, 7); + ); +} + +TEST(lagi_dialogue_lexer, basic_override_tags) { + tok_str("{\\b1}bold text{\\b0}", + expect_tok(OVR_BEGIN, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 1); + expect_tok(ARG, 1); + expect_tok(OVR_END, 1); + expect_tok(TEXT, 9); + expect_tok(OVR_BEGIN, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 1); + expect_tok(ARG, 1); + expect_tok(OVR_END, 1); + ); + + tok_str("{\\fnComic Sans MS}text", + expect_tok(OVR_BEGIN, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 2); + expect_tok(ARG, 5); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 4); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 2); + expect_tok(OVR_END, 1); + expect_tok(TEXT, 4); + ); + + tok_str("{\\pos(0,0)}a", + expect_tok(OVR_BEGIN, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 3); + expect_tok(OPEN_PAREN, 1); + expect_tok(ARG, 1); + expect_tok(ARG_SEP, 1); + expect_tok(ARG, 1); + expect_tok(CLOSE_PAREN, 1); + expect_tok(OVR_END, 1); + expect_tok(TEXT, 1); + ); + + tok_str("{\\pos( 0 , 0 )}a", + expect_tok(OVR_BEGIN, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 3); + expect_tok(OPEN_PAREN, 1); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 1); + 
expect_tok(WHITESPACE, 1); + expect_tok(ARG_SEP, 1); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 1); + expect_tok(WHITESPACE, 1); + expect_tok(CLOSE_PAREN, 1); + expect_tok(OVR_END, 1); + expect_tok(TEXT, 1); + ); + + tok_str("{\\c&HFFFFFF&\\2c&H0000FF&\\3c&H000000&}a", + expect_tok(OVR_BEGIN, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 1); + expect_tok(ARG, 9); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 2); + expect_tok(ARG, 9); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 2); + expect_tok(ARG, 9); + expect_tok(OVR_END, 1); + expect_tok(TEXT, 1); + ); + + tok_str("{\\t(0,100,\\clip(1, m 0 0 l 10 10 10 20))}a", + expect_tok(OVR_BEGIN, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 1); + expect_tok(OPEN_PAREN, 1); + expect_tok(ARG, 1); + expect_tok(ARG_SEP, 1); + expect_tok(ARG, 3); + expect_tok(ARG_SEP, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 4); + expect_tok(OPEN_PAREN, 1); + expect_tok(ARG, 1); + expect_tok(ARG_SEP, 1); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 1); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 1); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 1); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 1); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 2); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 2); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 2); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 2); + expect_tok(CLOSE_PAREN, 2); /* both closing parentheses are merged into a single token */ + expect_tok(OVR_END, 1); + expect_tok(TEXT, 1); + ); +} + +TEST(lagi_dialogue_lexer, merging) { + tok_str("{\\b\\b", /* the input ends without '}', so no OVR_END token is expected */ + expect_tok(OVR_BEGIN, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 1); + ); +} + +TEST(lagi_dialogue_lexer, whitespace) { + tok_str("{ \\ fn Comic Sans MS }asd", + expect_tok(OVR_BEGIN, 1); + expect_tok(WHITESPACE, 1); + expect_tok(TAG_START, 1); + expect_tok(WHITESPACE, 1); + expect_tok(TAG_NAME, 2); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 5); + expect_tok(WHITESPACE, 1); + 
expect_tok(ARG, 4); + expect_tok(WHITESPACE, 1); + expect_tok(ARG, 2); + expect_tok(WHITESPACE, 1); + expect_tok(OVR_END, 1); + expect_tok(TEXT, 3); + ); +} + +TEST(lagi_dialogue_lexer, comment) { + tok_str("{a}b", + expect_tok(OVR_BEGIN, 1); + expect_tok(COMMENT, 1); + expect_tok(OVR_END, 1); + expect_tok(TEXT, 1); + ); + + tok_str("{a\\b}c", + expect_tok(OVR_BEGIN, 1); + expect_tok(COMMENT, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 1); + expect_tok(OVR_END, 1); + expect_tok(TEXT, 1); + ); +} + +TEST(lagi_dialogue_lexer, malformed) { + tok_str("}", /* '}' has no rule of its own in the default state, so it is lexed as TEXT */ + expect_tok(TEXT, 1); + ); + + tok_str("{{", + expect_tok(OVR_BEGIN, 1); + expect_tok(ERROR, 1); + ); + + tok_str("{\\pos(0,0}a", /* unbalanced '(': the '}' still ends the block and the following text is TEXT */ + expect_tok(OVR_BEGIN, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 3); + expect_tok(OPEN_PAREN, 1); + expect_tok(ARG, 1); + expect_tok(ARG_SEP, 1); + expect_tok(ARG, 1); + expect_tok(OVR_END, 1); + expect_tok(TEXT, 1); + ); + + tok_str("{\\b1\\}asdf", /* a backslash directly followed by '}' yields TAG_START and then OVR_END */ + expect_tok(OVR_BEGIN, 1); + expect_tok(TAG_START, 1); + expect_tok(TAG_NAME, 1); + expect_tok(ARG, 1); + expect_tok(TAG_START, 1); + expect_tok(OVR_END, 1); + expect_tok(TEXT, 4); + ); +}