Add a lexer for the body of dialogue lines to libaegisub

This commit is contained in:
Thomas Goyne 2012-10-26 19:03:56 -07:00
parent bd78692148
commit 47bafe4b9f
5 changed files with 393 additions and 1 deletions

View File

@ -542,6 +542,14 @@
>
</File>
</Filter>
<Filter
Name="ASS"
>
<File
RelativePath="..\..\libaegisub\include\libaegisub\ass\dialogue_parser.h"
>
</File>
</Filter>
<File
RelativePath="..\..\libaegisub\lagi_pre.h"
>

View File

@ -17,12 +17,15 @@
#include "parser.h"
#include "libaegisub/color.h"
#include "libaegisub/ass/dialogue_parser.h"
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_fusion.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/home/phoenix/statement.hpp>
BOOST_FUSION_ADAPT_STRUCT(
agi::Color,
@ -96,13 +99,95 @@ struct color_grammar : qi::grammar<Iterator, agi::Color()> {
}
};
// Token definitions for lexing the body of a dialogue line with
// boost::spirit::lex. Each rule tags its match with an id from
// agi::ass::DialogueTokenType, and the semantic actions attached to the rules
// move the lexer between the default state and the named states "OVR", "ARG",
// "TAGSTART" and "TAGNAME".
//
// In every state the catch-all "." rule is listed last, presumably so that the
// more specific rules before it take precedence; do not reorder the
// alternatives without checking the lexer's tie-breaking rules.
template <typename Lexer>
struct dialogue_tokens : lex::lexer<Lexer> {
// Number of '(' matched minus number of ')' matched since the last '{' seen
// in the default state. Used to decide when a ')' ends a tag's arguments.
// NOTE(review): the ')' rules decrement unconditionally, so an unmatched ')'
// makes this negative until the next '{' resets it - confirm that is intended.
int paren_depth;
dialogue_tokens() : paren_depth(0) {
using lex::_state;
using lex::char_;
using lex::string;
using namespace boost::phoenix;
using namespace agi::ass::DialogueTokenType;
// Default state (re-entered below by the name "INITIAL"): text outside of
// override blocks
this->self
// A backslash followed by n, N or h
= string("\\\\[nNh]", LINE_BREAK)
// '{' opens an override block: reset the paren counter and switch to OVR
| char_('{', OVR_BEGIN)[ref(paren_depth) = 0, _state = "OVR"]
// Any other single character
| string(".", TEXT)
;
// OVR: inside an override block, but not currently inside a tag
this->self("OVR")
// A second '{' inside a block is reported as an error token
= char_('{', ERROR)
| char_('}', OVR_END)[_state = "INITIAL"]
// A backslash begins a tag
| char_('\\', TAG_START)[_state = "TAGSTART"]
| string("\\s+", WHITESPACE)
// Anything else in the block that is not part of a tag
| string(".", COMMENT)
;
// ARG: reading the argument(s) of a tag
this->self("ARG")
= char_('{', ERROR)
| char_('}', OVR_END)[_state = "INITIAL"]
| char_('(', OPEN_PAREN)[++ref(paren_depth)]
// Only the ')' which brings the depth back to exactly zero returns to OVR
| char_(')', CLOSE_PAREN)[--ref(paren_depth), if_(ref(paren_depth) == 0)[_state = "OVR"]]
// A backslash starts another tag without changing paren_depth
| char_('\\', TAG_START)[_state = "TAGSTART"]
| char_(',', ARG_SEP)
| string("\\s+", WHITESPACE)
| string(".", ARG)
;
// TAGSTART: immediately after the backslash which begins a tag
this->self("TAGSTART")
= string("\\s+", WHITESPACE)
// "r" and "fn" are complete tag names; whatever follows them is lexed in the
// ARG state
| string("r|fn", TAG_NAME)[_state = "ARG"]
// Another backslash is a new TAG_START; the state is left unchanged
| char_('\\', TAG_START)
| char_('}', OVR_END)[_state = "INITIAL"]
// A single lowercase letter or digit begins a tag name; the rest of the name
// is read in TAGNAME
| string("[a-z0-9]", TAG_NAME)[_state = "TAGNAME"]
// Any other character is not a tag: emit it as a comment and go back to OVR
| string(".", COMMENT)[_state = "OVR"]
;
// TAGNAME: after the first character of a tag name
this->self("TAGNAME")
// A run of lowercase letters is the rest of the name; arguments follow
= string("[a-z]+", TAG_NAME)[_state = "ARG"]
| char_('(', OPEN_PAREN)[++ref(paren_depth), _state = "ARG"]
| char_(')', CLOSE_PAREN)[--ref(paren_depth), if_(ref(paren_depth) == 0)[_state = "OVR"]]
| char_('}', OVR_END)[_state = "INITIAL"]
| char_('\\', TAG_START)[_state = "TAGSTART"]
// Any other character is the first character of the tag's argument
| string(".", ARG)[_state = "ARG"]
;
}
};
}
namespace agi { namespace parser {
namespace agi {
namespace parser {
/// Parse a string into a color using color_grammar.
/// @param dst Color which receives the parsed value
/// @param str Text to parse
/// @return true only if the grammar matched and consumed all of @p str
bool parse(Color &dst, std::string const& str) {
	std::string::const_iterator cursor = str.begin();
	color_grammar<std::string::const_iterator> grammar;

	// A failed match is a failure no matter how far the cursor advanced
	if (!parse(cursor, str.end(), grammar, dst))
		return false;

	// Trailing unparsed characters also count as a failure
	return cursor == str.end();
}
}
namespace ass {
std::vector<DialogueToken> TokenizeDialogueBody(std::string const& str) {
dialogue_tokens<lex::lexertl::actor_lexer<> > tokenizer;
char const* first = str.c_str();
char const* last = first + str.size();
std::vector<DialogueToken> data;
dialogue_tokens<lex::lexertl::actor_lexer<> >::iterator_type
it = tokenizer.begin(first, last),
end = tokenizer.end();
for (; it != end && token_is_valid(*it); ++it) {
int id = it->id();
ptrdiff_t len = it->value().end() - it->value().begin();
assert(len > 0);
if (data.empty() || data.back().type != id)
data.push_back(DialogueToken(id, len));
else
data.back().length += len;
}
return data;
}
}
}

View File

@ -0,0 +1,47 @@
// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#ifndef LAGI_PRE
#include <string>
#include <vector>
#endif
// Guard the declarations so that including this header more than once in a
// translation unit does not redefine the enum and DialogueToken.
#ifndef LIBAEGISUB_ASS_DIALOGUE_PARSER_H
#define LIBAEGISUB_ASS_DIALOGUE_PARSER_H

namespace agi {
	namespace ass {
		/// Ids of the tokens produced by TokenizeDialogueBody(). The enum is
		/// unnamed and wrapped in a namespace, so the values are plain ints
		/// referred to as DialogueTokenType::TEXT and so on.
		// NOTE(review): ERROR is also commonly a macro from the Windows SDK
		// headers; confirm include order on Windows builds.
		namespace DialogueTokenType {
			enum {
				TEXT = 1000, ///< Text outside of override blocks
				LINE_BREAK,  ///< \n, \N or \h outside of override blocks
				OVR_BEGIN,   ///< '{' opening an override block
				OVR_END,     ///< '}' closing an override block
				TAG_START,   ///< '\' beginning a tag inside an override block
				TAG_NAME,    ///< Name of a tag
				OPEN_PAREN,  ///< '(' in a tag
				CLOSE_PAREN, ///< ')' in a tag
				ARG_SEP,     ///< ',' between the arguments of a tag
				ARG,         ///< Argument of a tag
				ERROR,       ///< '{' inside of an override block
				COMMENT,     ///< Text in an override block which is not part of a tag
				WHITESPACE   ///< Whitespace inside of an override block
			};
		}

		/// A single token of a dialogue body: its type and how many characters
		/// of the source string it covers. Tokens do not store their position;
		/// it is implied by the lengths of the tokens before them.
		struct DialogueToken {
			int type;      ///< One of the DialogueTokenType values
			size_t length; ///< Length of the token in chars
			DialogueToken(int type, size_t length) : type(type), length(length) { }
		};

		/// Tokenize the body of a dialogue line.
		/// @param str Text to tokenize
		/// @return The tokens in order of appearance, with adjacent tokens of
		///         the same type merged into one
		std::vector<DialogueToken> TokenizeDialogueBody(std::string const& str);
	}
}

#endif // LIBAEGISUB_ASS_DIALOGUE_PARSER_H

View File

@ -20,6 +20,7 @@ SRC = \
libaegisub_access.cpp \
libaegisub_cajun.cpp \
libaegisub_color.cpp \
libaegisub_dialogue_lexer.cpp \
libaegisub_hotkey.cpp \
libaegisub_iconv.cpp \
libaegisub_keyframe.cpp \

View File

@ -0,0 +1,251 @@
// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#include <libaegisub/ass/dialogue_parser.h>
#include "main.h"
#include "util.h"
// Fixture for the dialogue lexer tests; adds nothing to the libagi base.
class lagi_dialogue_lexer : public libagi { };
using namespace agi::ass;
// An empty dialogue body must produce no tokens at all.
TEST(lagi_dialogue_lexer, empty) {
ASSERT_TRUE(TokenizeDialogueBody("").empty());
}
// Tokenize the string `arg1`, then run the statements passed as the remaining
// arguments (normally a sequence of expect_tok() checks) against the result.
// Declares `str`, `tok` and `token_index` for those statements to use, and
// finally checks that token_index equals the number of tokens produced, i.e.
// that the checks consumed every token.
#define tok_str(arg1, ...) do { \
std::string str = arg1; \
std::vector<DialogueToken> tok = TokenizeDialogueBody(str); \
size_t token_index = 0; \
__VA_ARGS__ \
EXPECT_EQ(token_index, tok.size()); \
} while(false)
// Check that the token at `token_index` has type
// DialogueTokenType::expected_type and length expected_len, then advance
// token_index. Relies on the `tok` and `token_index` variables declared by
// tok_str(). When there are no tokens left only the EXPECT_LT failure is
// recorded; tok is not indexed out of range and token_index is not advanced.
#define expect_tok(expected_type, expected_len) do { \
EXPECT_LT(token_index, tok.size()); \
if (token_index < tok.size()) { \
EXPECT_EQ(DialogueTokenType::expected_type, tok[token_index].type); \
EXPECT_EQ(expected_len, tok[token_index].length); \
++token_index; \
} \
} while(false)
// Text with no override blocks: runs of ordinary characters and line breaks.
TEST(lagi_dialogue_lexer, plain_text) {
// All eleven characters form a single TEXT token
tok_str("hello there",
expect_tok(TEXT, 11);
);
// \N is a two character LINE_BREAK splitting the text in two
tok_str("hello\\Nthere",
expect_tok(TEXT, 5);
expect_tok(LINE_BREAK, 2);
expect_tok(TEXT, 5);
);
// \n and \h are adjacent, so they are reported as one LINE_BREAK of length
// four, while \k is expected to be part of the following TEXT
tok_str("hello\\n\\h\\kthere",
expect_tok(TEXT, 5);
expect_tok(LINE_BREAK, 4);
expect_tok(TEXT, 7);
);
}
// Well-formed override blocks containing the common shapes of tags.
TEST(lagi_dialogue_lexer, basic_override_tags) {
// A tag with a single-character name and an argument with no parentheses
tok_str("{\\b1}bold text{\\b0}",
expect_tok(OVR_BEGIN, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 1);
expect_tok(ARG, 1);
expect_tok(OVR_END, 1);
expect_tok(TEXT, 9);
expect_tok(OVR_BEGIN, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 1);
expect_tok(ARG, 1);
expect_tok(OVR_END, 1);
);
// \fn takes the rest of the tag as its argument; the name is "fn" only, and
// the spaces in the font name split the argument into separate ARG tokens
tok_str("{\\fnComic Sans MS}text",
expect_tok(OVR_BEGIN, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 2);
expect_tok(ARG, 5);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 4);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 2);
expect_tok(OVR_END, 1);
expect_tok(TEXT, 4);
);
// A parenthesized, comma-separated argument list
tok_str("{\\pos(0,0)}a",
expect_tok(OVR_BEGIN, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 3);
expect_tok(OPEN_PAREN, 1);
expect_tok(ARG, 1);
expect_tok(ARG_SEP, 1);
expect_tok(ARG, 1);
expect_tok(CLOSE_PAREN, 1);
expect_tok(OVR_END, 1);
expect_tok(TEXT, 1);
);
// The same tag with whitespace around each argument
tok_str("{\\pos( 0 , 0 )}a",
expect_tok(OVR_BEGIN, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 3);
expect_tok(OPEN_PAREN, 1);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 1);
expect_tok(WHITESPACE, 1);
expect_tok(ARG_SEP, 1);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 1);
expect_tok(WHITESPACE, 1);
expect_tok(CLOSE_PAREN, 1);
expect_tok(OVR_END, 1);
expect_tok(TEXT, 1);
);
// Several tags in one block, including tag names which begin with a digit
tok_str("{\\c&HFFFFFF&\\2c&H0000FF&\\3c&H000000&}a",
expect_tok(OVR_BEGIN, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 1);
expect_tok(ARG, 9);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 2);
expect_tok(ARG, 9);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 2);
expect_tok(ARG, 9);
expect_tok(OVR_END, 1);
expect_tok(TEXT, 1);
);
// A tag nested in the arguments of another tag; the two closing parentheses
// are adjacent and so are reported as one CLOSE_PAREN of length two
tok_str("{\\t(0,100,\\clip(1, m 0 0 l 10 10 10 20))}a",
expect_tok(OVR_BEGIN, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 1);
expect_tok(OPEN_PAREN, 1);
expect_tok(ARG, 1);
expect_tok(ARG_SEP, 1);
expect_tok(ARG, 3);
expect_tok(ARG_SEP, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 4);
expect_tok(OPEN_PAREN, 1);
expect_tok(ARG, 1);
expect_tok(ARG_SEP, 1);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 1);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 1);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 1);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 1);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 2);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 2);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 2);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 2);
expect_tok(CLOSE_PAREN, 2);
expect_tok(OVR_END, 1);
expect_tok(TEXT, 1);
);
}
// Tokens of the same type which are not adjacent must stay separate: the two
// \b tags are expected as two TAG_START/TAG_NAME pairs. The block is left
// unterminated, so no OVR_END is expected.
TEST(lagi_dialogue_lexer, merging) {
tok_str("{\\b\\b",
expect_tok(OVR_BEGIN, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 1);
);
}
// Whitespace inside an override block (before the backslash, between the
// backslash and the tag name, around the argument) is expected as separate
// WHITESPACE tokens and must not stop the tag from being recognized.
TEST(lagi_dialogue_lexer, whitespace) {
tok_str("{ \\ fn Comic Sans MS }asd",
expect_tok(OVR_BEGIN, 1);
expect_tok(WHITESPACE, 1);
expect_tok(TAG_START, 1);
expect_tok(WHITESPACE, 1);
expect_tok(TAG_NAME, 2);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 5);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 4);
expect_tok(WHITESPACE, 1);
expect_tok(ARG, 2);
expect_tok(WHITESPACE, 1);
expect_tok(OVR_END, 1);
expect_tok(TEXT, 3);
);
}
// Text inside an override block which is not part of a tag is a COMMENT.
TEST(lagi_dialogue_lexer, comment) {
// A block containing only a comment
tok_str("{a}b",
expect_tok(OVR_BEGIN, 1);
expect_tok(COMMENT, 1);
expect_tok(OVR_END, 1);
expect_tok(TEXT, 1);
);
// A comment followed by a tag in the same block
tok_str("{a\\b}c",
expect_tok(OVR_BEGIN, 1);
expect_tok(COMMENT, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 1);
expect_tok(OVR_END, 1);
expect_tok(TEXT, 1);
);
}
// Input with unbalanced braces or parentheses must still be tokenized in full.
TEST(lagi_dialogue_lexer, malformed) {
// A '}' with no block open is ordinary text
tok_str("}",
expect_tok(TEXT, 1);
);
// A '{' inside an already open block is an ERROR token
tok_str("{{",
expect_tok(OVR_BEGIN, 1);
expect_tok(ERROR, 1);
);
// A missing ')' does not stop the '}' from closing the block
tok_str("{\\pos(0,0}a",
expect_tok(OVR_BEGIN, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 3);
expect_tok(OPEN_PAREN, 1);
expect_tok(ARG, 1);
expect_tok(ARG_SEP, 1);
expect_tok(ARG, 1);
expect_tok(OVR_END, 1);
expect_tok(TEXT, 1);
);
// A backslash directly before '}' is a TAG_START with no tag name
tok_str("{\\b1\\}asdf",
expect_tok(OVR_BEGIN, 1);
expect_tok(TAG_START, 1);
expect_tok(TAG_NAME, 1);
expect_tok(ARG, 1);
expect_tok(TAG_START, 1);
expect_tok(OVR_END, 1);
expect_tok(TEXT, 4);
);
}