From 88fdee726b2603c019994c387ae44c3168dca0b4 Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Tue, 6 Nov 2012 16:26:00 -0800 Subject: [PATCH] Extract word-splitting logic from the syntax highlighter --- aegisub/libaegisub/ass/dialogue_parser.cpp | 194 +++++++++++------- .../include/libaegisub/ass/dialogue_parser.h | 9 +- aegisub/src/dialog_spellchecker.cpp | 49 +++-- aegisub/src/dialog_spellchecker.h | 8 +- aegisub/src/scintilla_text_ctrl.cpp | 26 --- aegisub/src/scintilla_text_ctrl.h | 4 +- aegisub/src/subs_edit_ctrl.cpp | 46 +++-- aegisub/src/subs_edit_ctrl.h | 4 +- aegisub/src/utils.cpp | 69 ------- aegisub/src/utils.h | 12 -- aegisub/tests/Makefile | 5 +- aegisub/tests/libaegisub_syntax_highlight.cpp | 151 ++++++++++++++ aegisub/tests/libaegisub_word_split.cpp | 135 ++++++++++++ 13 files changed, 482 insertions(+), 230 deletions(-) create mode 100644 aegisub/tests/libaegisub_syntax_highlight.cpp create mode 100644 aegisub/tests/libaegisub_word_split.cpp diff --git a/aegisub/libaegisub/ass/dialogue_parser.cpp b/aegisub/libaegisub/ass/dialogue_parser.cpp index 14ef7e863..effe0ea7b 100644 --- a/aegisub/libaegisub/ass/dialogue_parser.cpp +++ b/aegisub/libaegisub/ass/dialogue_parser.cpp @@ -26,49 +26,76 @@ namespace { typedef std::vector TokenVec; -namespace dt = agi::ass::DialogueTokenType; -namespace ss = agi::ass::SyntaxStyle; +using namespace agi::ass; +namespace dt = DialogueTokenType; +namespace ss = SyntaxStyle; class SyntaxHighlighter { TokenVec ranges; std::string const& text; agi::SpellChecker *spellchecker; - agi::scoped_holder utf8_to_utf32; void SetStyling(int len, int type) { if (ranges.size() && ranges.back().type == type) ranges.back().length += len; else - ranges.push_back(agi::ass::DialogueToken(type, len)); + ranges.push_back(DialogueToken(type, len)); } - void CheckWord(int start, int end) { - int len = end - start; - if (!len) return; +public: + SyntaxHighlighter(std::string const& text, agi::SpellChecker *spellchecker) + : text(text) + , spellchecker(spellchecker) + { } - if (!spellchecker->CheckWord(text.substr(start, len))) - SetStyling(len, ss::SPELLING); - else - SetStyling(len, ss::NORMAL); + TokenVec Highlight(TokenVec const& tokens, bool template_line) { + if (tokens.empty()) return ranges; + + size_t pos = 0; + + for (size_t i = 0; i < tokens.size(); ++i) { + size_t len = tokens[i].length; + switch (tokens[i].type) { + case dt::LINE_BREAK: SetStyling(len, ss::LINE_BREAK); break; + case dt::ERROR: SetStyling(len, ss::ERROR); break; + case dt::ARG: SetStyling(len, ss::PARAMETER); break; + case dt::COMMENT: SetStyling(len, ss::COMMENT); break; + case dt::WHITESPACE: SetStyling(len, ss::NORMAL); break; + case dt::DRAWING: SetStyling(len, ss::DRAWING); break; + case dt::TEXT: SetStyling(len, ss::NORMAL); break; + case dt::TAG_NAME: SetStyling(len, ss::TAG); break; + case dt::OPEN_PAREN: case dt::CLOSE_PAREN: case dt::ARG_SEP: case dt::TAG_START: + SetStyling(len, ss::PUNCTUATION); + break; + case dt::OVR_BEGIN: case dt::OVR_END: + SetStyling(len, ss::OVERRIDE); + break; + case dt::WORD: + if (spellchecker && !spellchecker->CheckWord(text.substr(pos, len))) + SetStyling(len, ss::SPELLING); + else + SetStyling(len, ss::NORMAL); + break; + } + + pos += len; + // karaoke templater + } + + return ranges; } +}; - int NextChar(int pos, int len, int& char_len) { - int chr = 0; - char *inptr = const_cast(&text[pos]); - size_t inlen = len; - char *outptr = (char *)&chr; - size_t outlen = sizeof chr; +class WordSplitter { + std::string const& text; + std::vector &tokens; + agi::scoped_holder utf8_to_utf32; + size_t last_ovr_end; + size_t pos; + bool in_drawing; - iconv(utf8_to_utf32, &inptr, &inlen, &outptr, &outlen); - if (outlen != 0) - return 0; - - char_len = len - inlen; - return chr; - } - - void StyleSpellCheck(int pos, int len) { - const int delims[] = { + bool IsWordSep(int chr) { + static const int delims[] = { 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a, 0x003b, 0x003d, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e, @@ -86,38 +113,72 @@ class SyntaxHighlighter { 0xff5b, 0xff5d, 0xff5e }; - int chrlen = 0; - int start = pos; - for (; len > 0; pos += chrlen, len -= chrlen) { - int chr = NextChar(pos, len, chrlen); - if (!chr) return; + return std::binary_search(std::begin(delims), std::end(delims), chr); + } - if (std::binary_search(std::begin(delims), std::end(delims), chr)) { - CheckWord(start, pos); - SetStyling(1, ss::NORMAL); - start = pos + 1; - } + int NextChar(int pos, int len, int& char_len) { + int chr = 0; + char *inptr = const_cast(&text[pos]); + size_t inlen = len; + char *outptr = (char *)&chr; + size_t outlen = sizeof chr; + + iconv(utf8_to_utf32, &inptr, &inlen, &outptr, &outlen); + if (outlen != 0) + return 0; + + char_len = len - inlen; + return chr; + } + + void SwitchTo(size_t &i, int type, int len) { + if (tokens[i].type == type) return; + + if (tokens[i].length == (size_t)len) + tokens[i].type = type; + else { + tokens.insert(tokens.begin() + i + 1, DialogueToken(type, len)); + tokens[i].length -= len; + ++i; + ++last_ovr_end; + } + } + + void SplitText(size_t &i) { + if (in_drawing) { + tokens[i].type = dt::DRAWING; + return; } - CheckWord(start, pos); + int chrlen = 0; + int len = tokens[i].length; + int tpos = pos; + for (; len > 0; tpos += chrlen, len -= chrlen) { + int chr = NextChar(tpos, len, chrlen); + if (!chr) return; + + if (IsWordSep(chr)) + SwitchTo(i, dt::TEXT, len); + else + SwitchTo(i, dt::WORD, len); + } } public: - SyntaxHighlighter(std::string const& text, agi::SpellChecker *spellchecker) + WordSplitter(std::string const& text, std::vector &tokens) : text(text) - , spellchecker(spellchecker) + , tokens(tokens) , utf8_to_utf32(iconv_open("utf-32le", "utf-8"), iconv_close) + , last_ovr_end(0) + , pos(0) + , in_drawing(false) { } - TokenVec Highlight(TokenVec const& tokens, bool template_line) { - if (tokens.empty()) return ranges; - - bool in_drawing = false; - size_t pos = 0; + void SplitWords() { + if (tokens.empty()) return; // VSFilter treats unclosed override blocks as plain text, so pretend // all tokens after the last override block are TEXT - size_t last_ovr_end = 0; for (size_t i = tokens.size(); i > 0; --i) { if (tokens[i - 1].type == dt::OVR_END) { last_ovr_end = i - 1; @@ -127,30 +188,14 @@ public: for (size_t i = 0; i < tokens.size(); ++i) { size_t len = tokens[i].length; - switch (i > last_ovr_end ? dt::TEXT : tokens[i].type) { - case dt::LINE_BREAK: SetStyling(len, ss::LINE_BREAK); break; - case dt::ERROR: SetStyling(len, ss::ERROR); break; - case dt::ARG: SetStyling(len, ss::PARAMETER); break; - case dt::COMMENT: SetStyling(len, ss::COMMENT); break; - case dt::WHITESPACE: SetStyling(len, ss::NORMAL); break; - case dt::OPEN_PAREN: case dt::CLOSE_PAREN: case dt::ARG_SEP: case dt::TAG_START: - SetStyling(len, ss::PUNCTUATION); - break; - case dt::OVR_BEGIN: case dt::OVR_END: - SetStyling(len, ss::OVERRIDE); - break; - - case dt::TEXT: - if (in_drawing) - SetStyling(len, ss::DRAWING); - else if (spellchecker) - StyleSpellCheck(pos, len); - else - SetStyling(len, ss::NORMAL); - break; - + switch (tokens[i].type) { + case dt::LINE_BREAK: break; + case dt::TEXT: SplitText(i); break; case dt::TAG_NAME: - SetStyling(len, ss::TAG); + if (i > last_ovr_end) { + SplitText(i); + break; + } if (len != 1 || i + 1 >= tokens.size() || text[pos] != 'p') break; @@ -170,13 +215,14 @@ public: break; } break; + default: + if (i > last_ovr_end) + SplitText(i); + break; } pos += len; - // karaoke templater } - - return ranges; } }; } @@ -188,5 +234,9 @@ std::vector SyntaxHighlight(std::string const& text, std::vector< return SyntaxHighlighter(text, spellchecker).Highlight(tokens, template_line); } +void SplitWords(std::string const& str, std::vector &tokens) { + WordSplitter(str, tokens).SplitWords(); +} + } } diff --git a/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h b/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h index 87624ef76..871c65f9d 100644 --- a/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h +++ b/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h @@ -27,6 +27,7 @@ namespace agi { namespace DialogueTokenType { enum { TEXT = 1000, + WORD, LINE_BREAK, OVR_BEGIN, OVR_END, @@ -38,7 +39,8 @@ namespace agi { ARG, ERROR, COMMENT, - WHITESPACE + WHITESPACE, + DRAWING }; } @@ -66,8 +68,13 @@ namespace agi { DialogueToken(int type, size_t length) : type(type), length(length) { } }; + /// Tokenize the passed string as the body of a dialogue line std::vector TokenizeDialogueBody(std::string const& str); + /// Split the words in the TEXT tokens of the lexed line into their + /// own tokens and convert the body of drawings to DRAWING tokens + void SplitWords(std::string const& str, std::vector &tokens); + std::vector SyntaxHighlight(std::string const& text, std::vector const& tokens, bool template_line, SpellChecker *spellchecker); } } diff --git a/aegisub/src/dialog_spellchecker.cpp b/aegisub/src/dialog_spellchecker.cpp index 6e5d07e69..f0050dad1 100644 --- a/aegisub/src/dialog_spellchecker.cpp +++ b/aegisub/src/dialog_spellchecker.cpp @@ -47,6 +47,7 @@ #include "subs_edit_ctrl.h" #include "utils.h" +#include #include #include @@ -168,7 +169,7 @@ void DialogSpellChecker::OnReplace(wxCommandEvent&) { } void DialogSpellChecker::OnReplaceAll(wxCommandEvent&) { - auto_replace[orig_word->GetValue()] = replace_word->GetValue(); + auto_replace[from_wx(orig_word->GetValue())] = from_wx(replace_word->GetValue()); Replace(); FindNext(); @@ -179,7 +180,7 @@ void DialogSpellChecker::OnIgnore(wxCommandEvent&) { } void DialogSpellChecker::OnIgnoreAll(wxCommandEvent&) { - auto_ignore.insert(orig_word->GetValue()); + auto_ignore.emplace(from_wx(orig_word->GetValue())); FindNext(); } @@ -247,19 +248,22 @@ bool DialogSpellChecker::FindNext() { bool DialogSpellChecker::CheckLine(AssDialogue *active_line, int start_pos, int *commit_id) { if (active_line->Comment && skip_comments->GetValue()) return false; - IntPairVector results; - GetWordBoundaries(active_line->Text, results); + std::string text = from_wx(active_line->Text); + auto tokens = agi::ass::TokenizeDialogueBody(text); + agi::ass::SplitWords(text, tokens); - int shift = 0; - for (auto const& result : results) { - word_start = result.first + shift; + word_start = 0; + for (auto const& tok : tokens) { + word_start += tok.length; + if (tok.type != agi::ass::DialogueTokenType::WORD) continue; if (word_start < start_pos) continue; - word_end = result.second + shift; - wxString word = active_line->Text.Mid(word_start, word_end - word_start); - if (auto_ignore.count(word) || spellchecker->CheckWord(from_wx(word))) continue; + word_len = tok.length; + std::string word = text.substr(word_start, word_len); - std::map::const_iterator auto_rep = auto_replace.find(word); + if (auto_ignore.count(word) || spellchecker->CheckWord(word)) continue; + + auto auto_rep = auto_replace.find(word); if (auto_rep == auto_replace.end()) { #ifdef __WXGTK__ // http://trac.wxwidgets.org/ticket/14369 @@ -274,9 +278,10 @@ bool DialogSpellChecker::CheckLine(AssDialogue *active_line, int start_pos, int return true; } - active_line->Text = active_line->Text.Left(word_start) + auto_rep->second + active_line->Text.Mid(word_end); + text.replace(word_start, word_len, auto_rep->second); + active_line->Text = from_wx(text); *commit_id = context->ass->Commit(_("spell check replace"), AssFile::COMMIT_DIAG_TEXT, *commit_id); - shift += auto_rep->second.size() - auto_rep->first.size(); + word_start += auto_rep->second.size() - auto_rep->first.size(); } return false; } @@ -285,23 +290,23 @@ void DialogSpellChecker::Replace() { AssDialogue *active_line = context->selectionController->GetActiveLine(); // Only replace if the user hasn't changed the selection to something else - if (active_line->Text.Mid(word_start, word_end - word_start) == orig_word->GetValue()) { - active_line->Text = active_line->Text.Left(word_start) + replace_word->GetValue() + active_line->Text.Mid(word_end); + if (active_line->Text.Mid(word_start, word_len) == orig_word->GetValue()) { + active_line->Text.replace(word_start, word_len, replace_word->GetValue()); context->ass->Commit(_("spell check replace"), AssFile::COMMIT_DIAG_TEXT); context->textSelectionController->SetInsertionPoint(word_start + replace_word->GetValue().size()); } } -void DialogSpellChecker::SetWord(wxString const& word) { - orig_word->SetValue(word); +void DialogSpellChecker::SetWord(std::string const& word) { + orig_word->SetValue(to_wx(word)); - wxArrayString suggestions = to_wx(spellchecker->GetSuggestions(from_wx(word))); - replace_word->SetValue(suggestions.size() ? suggestions[0] : word); + wxArrayString suggestions = to_wx(spellchecker->GetSuggestions(word)); + replace_word->SetValue(suggestions.size() ? suggestions[0] : to_wx(word)); suggest_list->Clear(); suggest_list->Append(suggestions); - context->textSelectionController->SetSelection(word_start, word_end); - context->textSelectionController->SetInsertionPoint(word_end); + context->textSelectionController->SetSelection(word_start, word_start + word_len); + context->textSelectionController->SetInsertionPoint(word_start + word_len); - add_button->Enable(spellchecker->CanAddWord(from_wx(word))); + add_button->Enable(spellchecker->CanAddWord(word)); } diff --git a/aegisub/src/dialog_spellchecker.h b/aegisub/src/dialog_spellchecker.h index 0fe012395..ca8a0a666 100644 --- a/aegisub/src/dialog_spellchecker.h +++ b/aegisub/src/dialog_spellchecker.h @@ -48,16 +48,16 @@ class DialogSpellChecker : public wxDialog { agi::scoped_ptr spellchecker; ///< The spellchecking engine /// Words which the user has indicated should always be corrected - std::map auto_replace; + std::map auto_replace; /// Words which the user has temporarily added to the dictionary - std::set auto_ignore; + std::set auto_ignore; /// Dictionaries available wxArrayString dictionary_lang_codes; int word_start; ///< Start index of the current misspelled word - int word_end; ///< End index of the current misspelled word + int word_len; ///< Length of the current misspelled word wxTextCtrl *orig_word; ///< The word being corrected wxTextCtrl *replace_word; ///< The replacement that will be used if "Replace" is clicked @@ -83,7 +83,7 @@ class DialogSpellChecker : public wxDialog { bool CheckLine(AssDialogue *active_line, int start_pos, int *commit_id); /// Set the current word to be corrected - void SetWord(wxString const& word); + void SetWord(std::string const& word); /// Correct the currently selected word void Replace(); diff --git a/aegisub/src/scintilla_text_ctrl.cpp b/aegisub/src/scintilla_text_ctrl.cpp index 95f5a25fa..36cc07761 100644 --- a/aegisub/src/scintilla_text_ctrl.cpp +++ b/aegisub/src/scintilla_text_ctrl.cpp @@ -82,32 +82,6 @@ void ScintillaTextCtrl::SetUnicodeStyling(int start,int length,int style) { SetStyling(len,style); } -/// @brief Get boundaries of word at position -void ScintillaTextCtrl::GetBoundsOfWordAtPosition(int pos,int &start,int &end) { - IntPairVector results; - GetWordBoundaries(GetText(), results); - - // Get boundaries - for (auto const& result : results) { - if (result.first <= pos && result.second >= pos) { - start = result.first; - end = result.second; - return; - } - } - - // Word not found - start = 0; - end = 0; -} - -/// @brief Get word at specified position -wxString ScintillaTextCtrl::GetWordAtPosition(int pos) { - int start,end; - GetBoundsOfWordAtPosition(pos, start, end); - return GetText().Mid(start, end - start); -} - /// @brief Set selection, unicode-aware void ScintillaTextCtrl::SetSelectionU(int start, int end) { SetSelection(GetUnicodePosition(start),GetUnicodePosition(end)); diff --git a/aegisub/src/scintilla_text_ctrl.h b/aegisub/src/scintilla_text_ctrl.h index ee121517a..c0d627b1d 100644 --- a/aegisub/src/scintilla_text_ctrl.h +++ b/aegisub/src/scintilla_text_ctrl.h @@ -34,6 +34,8 @@ #ifndef AGI_PRE #include + +#include #endif /// DOCME @@ -46,8 +48,6 @@ class ScintillaTextCtrl : public wxStyledTextCtrl { void OnMouseWheel(wxMouseEvent& evt); public: - wxString GetWordAtPosition(int pos); - void GetBoundsOfWordAtPosition(int pos,int &start,int &end); int GetUnicodePosition(int pos); int GetReverseUnicodePosition(int pos); diff --git a/aegisub/src/subs_edit_ctrl.cpp b/aegisub/src/subs_edit_ctrl.cpp index 847d31d41..eb3d5fb41 100644 --- a/aegisub/src/subs_edit_ctrl.cpp +++ b/aegisub/src/subs_edit_ctrl.cpp @@ -213,6 +213,7 @@ void SubsTextEditCtrl::UpdateStyle() { line_text = move(text); } tokenized_line = agi::ass::TokenizeDialogueBody(line_text); + agi::ass::SplitWords(line_text, tokenized_line); cursor_pos = -1; UpdateCallTip(); @@ -298,15 +299,13 @@ void SubsTextEditCtrl::Paste() { void SubsTextEditCtrl::OnContextMenu(wxContextMenuEvent &event) { wxPoint pos = event.GetPosition(); int activePos; - if (pos == wxDefaultPosition) { + if (pos == wxDefaultPosition) activePos = GetCurrentPos(); - } - else { + else activePos = PositionFromPoint(ScreenToClient(pos)); - } - currentWordPos = GetReverseUnicodePosition(activePos); - currentWord = from_wx(GetWordAtPosition(currentWordPos)); + currentWordPos = GetBoundsOfWordAtPosition(activePos); + currentWord = line_text.substr(currentWordPos.first, currentWordPos.second); wxMenu menu; if (!currentWord.empty()) { @@ -431,27 +430,22 @@ void SubsTextEditCtrl::OnAddToDictionary(wxCommandEvent &) { void SubsTextEditCtrl::OnUseSuggestion(wxCommandEvent &event) { std::string suggestion; int sugIdx = event.GetId() - EDIT_MENU_THESAURUS_SUGS; - if (sugIdx >= 0) { - suggestion = lagi_wxString(thesSugs[sugIdx]); - } - else { + if (sugIdx >= 0) + suggestion = from_wx(thesSugs[sugIdx]); + else suggestion = sugs[event.GetId() - EDIT_MENU_SUGGESTIONS]; - } // Strip suggestion of parenthesis size_t pos = suggestion.find("("); if (pos != suggestion.npos) suggestion.resize(pos - 1); - // Get boundaries of text being replaced - int start, end; - GetBoundsOfWordAtPosition(currentWordPos, start, end); + // line_text needs to get cleared before SetTextRaw to ensure it gets reparsed + std::string new_text; + swap(line_text, new_text); + SetTextRaw(new_text.replace(currentWordPos.first, currentWordPos.second, suggestion).c_str()); - wxString text = GetText(); - SetText(text.Left(std::max(0, start)) + to_wx(suggestion) + text.Mid(end)); - - // Set selection - SetSelectionU(start, start+suggestion.size()); + SetSelection(currentWordPos.first, currentWordPos.first + suggestion.size()); SetFocus(); } @@ -480,3 +474,17 @@ void SubsTextEditCtrl::OnSetThesLanguage(wxCommandEvent &event) { UpdateStyle(); } + +std::pair SubsTextEditCtrl::GetBoundsOfWordAtPosition(int pos) { + int len = 0; + for (auto const& tok : tokenized_line) { + if ((int)tok.length > pos) { + if (tok.type == agi::ass::DialogueTokenType::WORD) + return std::make_pair(len, tok.length); + return std::make_pair(0, 0); + } + len += tok.length; + } + + return std::make_pair(0, 0); +} diff --git a/aegisub/src/subs_edit_ctrl.h b/aegisub/src/subs_edit_ctrl.h index 00cf6983e..9dc4bfdd2 100644 --- a/aegisub/src/subs_edit_ctrl.h +++ b/aegisub/src/subs_edit_ctrl.h @@ -68,7 +68,7 @@ class SubsTextEditCtrl : public ScintillaTextCtrl { std::string currentWord; /// The beginning of the word right-clicked on, for spellchecker replacing - int currentWordPos; + std::pair currentWordPos; /// Spellchecker suggestions for the last right-clicked word std::vector sugs; @@ -129,5 +129,7 @@ public: void SetTextTo(wxString const& text); void Paste(); + std::pair GetBoundsOfWordAtPosition(int pos); + DECLARE_EVENT_TABLE() }; diff --git a/aegisub/src/utils.cpp b/aegisub/src/utils.cpp index 6a796f7bb..7716c682d 100644 --- a/aegisub/src/utils.cpp +++ b/aegisub/src/utils.cpp @@ -118,75 +118,6 @@ int SmallestPowerOf2(int x) { return x; } -void GetWordBoundaries(wxString const& text, IntPairVector &results, int start, int end) { - int depth = 0; - bool in_draw_mode = false; - if (end < 0) end = text.size(); - - // Delimiters - const wxUniChar delims[] = { - 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, - 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a, - 0x003b, 0x003d, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e, - 0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 0x00a1, 0x00a2, - 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00aa, 0x00ab, - 0x00b0, 0x00b6, 0x00b7, 0x00ba, 0x00bb, 0x00bf, 0x02dc, 0x0e3f, - 0x2010, 0x2013, 0x2014, 0x2015, 0x2018, 0x2019, 0x201c, 0x201d, - 0x2020, 0x2021, 0x2022, 0x2025, 0x2026, 0x2026, 0x2030, 0x2031, - 0x2032, 0x203b, 0x203b, 0x203d, 0x2042, 0x2044, 0x20a6, 0x20a9, - 0x20aa, 0x20ac, 0x20ad, 0x2116, 0x2234, 0x2235, 0x2420, 0x2422, - 0x2423, 0x2506, 0x25ca, 0x2605, 0x261e, 0x2e2e, 0x3000, 0x3001, - 0x3002, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 0x300d, 0x300e, - 0x300f, 0x3010, 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, 0x3018, - 0x3019, 0x301a, 0x301b, 0x301c, 0x3030, 0x303d, 0x30fb, 0xff0a, - 0xff5b, 0xff5d, 0xff5e - }; - - for (int i = start; i < end + 1; ++i) { - // Current character - wxUniChar cur = i < end ? text[i] : wxUniChar('.'); - - // Increase depth - if (cur == '{') { - depth++; - if (depth == 1 && start != i && !in_draw_mode) - results.push_back(std::make_pair(start, i)); - } - // Decrease depth - else if (cur == '}') { - depth--; - start = i + 1; - } - else if (depth > 0) { - // Check for draw mode - if (cur == '\\' && i + 1 < end && text[i + 1] == 'p') { - i += 2; - - // Eat leading zeros - while (i < end && text[i] == '0') ++i; - - in_draw_mode = i < end && text[i] >= '0' && text[i] <= '9'; - if (!in_draw_mode) --i; - } - } - else if (!in_draw_mode) { - // Check if it is \n or \N - if (cur == '\\' && i < end-1 && (text[i+1] == 'N' || text[i+1] == 'n' || text[i+1] == 'h')) { - if (start != i) - results.push_back(std::make_pair(start, i)); - start = i + 2; - i++; - } - // Check for standard delimiters - else if (std::binary_search(delims, delims + sizeof(delims) / sizeof(delims[0]), cur)) { - if (start != i) - results.push_back(std::make_pair(start, i)); - start = i + 1; - } - } - } -} - bool IsWhitespace(wchar_t c) { const wchar_t whitespaces[] = { diff --git a/aegisub/src/utils.h b/aegisub/src/utils.h index b8e695bf0..caa03dc42 100644 --- a/aegisub/src/utils.h +++ b/aegisub/src/utils.h @@ -49,8 +49,6 @@ class wxMouseEvent; class wxWindow; -typedef std::vector > IntPairVector; - /// @brief Make a path relative to reference wxString MakeRelativePath(wxString path,wxString reference); /// @brief Extract original path from relative @@ -64,16 +62,6 @@ wxString PrettySize(int bytes); /// Algorithm from http://bob.allegronetwork.com/prog/tricks.html int SmallestPowerOf2(int x); -/// Get the indices in text which are the beginnings of words -/// @param text Text to split into words -/// @param[out] results Vector of indices which are the beginnings of words -/// @param start First index in text to check -/// @param end Last index in text to check, or -1 for end -/// -/// This is ASS-specific and not a general purpose word boundary finder; words -/// within override blocks or drawing blocks are ignored -void GetWordBoundaries(wxString const& text, IntPairVector &results, int start=0, int end=-1); - /// Check if wchar 'c' is a whitespace character bool IsWhitespace(wchar_t c); diff --git a/aegisub/tests/Makefile b/aegisub/tests/Makefile index 7b8adfc7f..78e4454a5 100644 --- a/aegisub/tests/Makefile +++ b/aegisub/tests/Makefile @@ -25,14 +25,15 @@ SRC = \ libaegisub_iconv.cpp \ libaegisub_keyframe.cpp \ libaegisub_line_iterator.cpp \ + libaegisub_line_wrap.cpp \ libaegisub_option.cpp \ libaegisub_mru.cpp \ libaegisub_signals.cpp \ libaegisub_thesaurus.cpp \ libaegisub_util.cpp \ libaegisub_vfr.cpp \ - libaegisub_line_wrap.cpp - + libaegisub_word_split.cpp + HEADER = \ *.h diff --git a/aegisub/tests/libaegisub_syntax_highlight.cpp b/aegisub/tests/libaegisub_syntax_highlight.cpp new file mode 100644 index 000000000..d0a0fee0e --- /dev/null +++ b/aegisub/tests/libaegisub_syntax_highlight.cpp @@ -0,0 +1,151 @@ +// Copyright (c) 2012, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#include +#include + +#include "main.h" + +class MockSpellChecker : public agi::SpellChecker { + void AddWord(std::string const&) { } + bool CanAddWord(std::string const&) { return false; } + std::vector GetSuggestions(std::string const&) { return std::vector(); } + std::vector GetLanguageList() { return std::vector(); } + bool CheckWord(std::string const& word) { return word != "incorrect"; } +}; + +using namespace agi::ass; +namespace dt = DialogueTokenType; +namespace ss = SyntaxStyle; + +class lagi_syntax : public libagi { }; + +TEST(lagi_syntax, empty) { + std::string text; + std::vector tokens; + + EXPECT_TRUE(SyntaxHighlight(text, tokens, false, 0).empty()); + + tokens.emplace_back(dt::TEXT, 0); + auto syntax = SyntaxHighlight(text, tokens, false, 0); + EXPECT_EQ(1u, syntax.size()); + EXPECT_EQ(ss::NORMAL, syntax[0].type); +} + +#define tok_str(arg1, ...) do { \ + MockSpellChecker spellchecker; \ + std::string str = arg1; \ + std::vector tok = TokenizeDialogueBody(str); \ + SplitWords(str, tok); \ + std::vector styles = SyntaxHighlight(str, tok, false, &spellchecker); \ + size_t token_index = 0; \ + __VA_ARGS__ \ + EXPECT_EQ(token_index, styles.size()); \ +} while(false) + +#define expect_style(expected_type, expected_len) do { \ + EXPECT_LT(token_index, styles.size()); \ + if (token_index < styles.size()) { \ + EXPECT_EQ(expected_type, styles[token_index].type); \ + EXPECT_EQ(expected_len, styles[token_index].length); \ + ++token_index; \ + } \ +} while(false) + +TEST(lagi_syntax, spellcheck) { + tok_str("correct incorrect correct", + expect_style(ss::NORMAL, 8u); + expect_style(ss::SPELLING, 9u); + expect_style(ss::NORMAL, 8u); + ); +} + +TEST(lagi_syntax, drawing) { + tok_str("incorrect{\\p1}m 10 10{\\p}correct", + expect_style(ss::SPELLING, 9u); + expect_style(ss::OVERRIDE, 1u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::TAG, 1u); + expect_style(ss::PARAMETER, 1u); + expect_style(ss::OVERRIDE, 1u); + expect_style(ss::DRAWING, 7u); + expect_style(ss::OVERRIDE, 1u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::TAG, 1u); + expect_style(ss::OVERRIDE, 1u); + expect_style(ss::NORMAL, 7u); + ); +} + +TEST(lagi_syntax, transform) { + tok_str("{\\t(0, 0, \\clip(0,0,10,10)}clipped text", + expect_style(ss::OVERRIDE, 1u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::TAG, 1u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::PARAMETER, 1u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::NORMAL, 1u); + expect_style(ss::PARAMETER, 1u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::NORMAL, 1u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::TAG, 4u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::PARAMETER, 1u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::PARAMETER, 1u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::PARAMETER, 2u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::PARAMETER, 2u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::OVERRIDE, 1u); + expect_style(ss::NORMAL, 12u); + ); +} + +TEST(lagi_syntax, unclosed) { + tok_str("{\\incorrect}{\\incorrect", + expect_style(ss::OVERRIDE, 1u); + expect_style(ss::PUNCTUATION, 1u); + expect_style(ss::TAG, 9u); + expect_style(ss::OVERRIDE, 1u); + expect_style(ss::NORMAL, 2u); + expect_style(ss::SPELLING, 9u); + ); +} + +TEST(lagi_syntax, comment) { + tok_str("abc{def}ghi", + expect_style(ss::NORMAL, 3u); + expect_style(ss::OVERRIDE, 1u); + expect_style(ss::COMMENT, 3u); + expect_style(ss::OVERRIDE, 1u); + expect_style(ss::NORMAL, 3u); + ); +} + +TEST(lagi_syntax, linebreak) { + tok_str("a\\Nb\\nc\\hd\\N\\N", + expect_style(ss::NORMAL, 1u); + expect_style(ss::LINE_BREAK, 2u); + expect_style(ss::NORMAL, 1u); + expect_style(ss::LINE_BREAK, 2u); + expect_style(ss::NORMAL, 1u); + expect_style(ss::LINE_BREAK, 2u); + expect_style(ss::NORMAL, 1u); + expect_style(ss::LINE_BREAK, 4u); + ); +} diff --git a/aegisub/tests/libaegisub_word_split.cpp b/aegisub/tests/libaegisub_word_split.cpp new file mode 100644 index 000000000..7178f1de6 --- /dev/null +++ b/aegisub/tests/libaegisub_word_split.cpp @@ -0,0 +1,135 @@ +// Copyright (c) 2012, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +#include "main.h" + +#include + +class lagi_word_split : public libagi { }; + +using namespace agi::ass; +namespace dt = DialogueTokenType; + +TEST(lagi_word_split, empty) { + std::string text; + std::vector tokens; + + SplitWords(text, tokens); + EXPECT_TRUE(tokens.empty()); + + tokens.emplace_back(0, 0); + SplitWords(text, tokens); + EXPECT_EQ(1u, tokens.size()); +} + +TEST(lagi_word_split, one_word) { + std::string text = "abc"; + std::vector tokens = {{dt::TEXT, 3}}; + + SplitWords(text, tokens); + ASSERT_EQ(1u, tokens.size()); + EXPECT_EQ(dt::WORD, tokens[0].type); +} + +TEST(lagi_word_split, two_words_space) { + std::string text = "abc def"; + std::vector tokens = {{dt::TEXT, 7}}; + + SplitWords(text, tokens); + ASSERT_EQ(3u, tokens.size()); + EXPECT_EQ(dt::WORD, tokens[0].type); + EXPECT_EQ(3u, tokens[0].length); + EXPECT_EQ(dt::TEXT, tokens[1].type); + EXPECT_EQ(1u, tokens[1].length); + EXPECT_EQ(dt::WORD, tokens[2].type); + EXPECT_EQ(3u, tokens[2].length); +} + +TEST(lagi_word_split, two_words_newline) { + std::string text = "abc\\Ndef"; + std::vector tokens = { + {dt::TEXT, 3}, + {dt::LINE_BREAK, 2}, + {dt::TEXT, 3} + }; + + SplitWords(text, tokens); + ASSERT_EQ(3u, tokens.size()); + EXPECT_EQ(dt::WORD, tokens[0].type); + EXPECT_EQ(3u, tokens[0].length); + EXPECT_EQ(dt::LINE_BREAK, tokens[1].type); + EXPECT_EQ(2u, tokens[1].length); + EXPECT_EQ(dt::WORD, tokens[2].type); + EXPECT_EQ(3u, tokens[2].length); +} + +TEST(lagi_word_split, two_words_unicode) { + std::string text = u8"abc\u300adef"; + std::vector tokens = {{dt::TEXT, 9}}; + + SplitWords(text, tokens); + ASSERT_EQ(3u, tokens.size()); + EXPECT_EQ(dt::WORD, tokens[0].type); + EXPECT_EQ(3u, tokens[0].length); + EXPECT_EQ(dt::TEXT, tokens[1].type); + EXPECT_EQ(3u, tokens[1].length); + EXPECT_EQ(dt::WORD, tokens[2].type); + EXPECT_EQ(3u, tokens[2].length); +} + +TEST(lagi_word_split, drawing) { + std::string text = "a b{\\p1}m 10{\\p0}c"; + std::vector tokens = { + {dt::TEXT, 3}, + {dt::OVR_BEGIN, 1}, + {dt::TAG_START, 1}, + {dt::TAG_NAME, 1}, + {dt::ARG, 1}, + {dt::OVR_END, 1}, + {dt::TEXT, 4}, + {dt::OVR_BEGIN, 1}, + {dt::TAG_START, 1}, + {dt::TAG_NAME, 1}, + {dt::ARG, 1}, + {dt::OVR_END, 1}, + {dt::TEXT, 1} + }; + + SplitWords(text, tokens); + + ASSERT_EQ(15u, tokens.size()); + EXPECT_EQ(dt::WORD, tokens[0].type); + EXPECT_EQ(dt::WORD, tokens[2].type); + EXPECT_EQ(dt::WORD, tokens[14].type); + + EXPECT_EQ(dt::DRAWING, tokens[8].type); +} + +TEST(lagi_word_split, unclosed_ovr) { + std::string text = "a{\\b"; + std::vector tokens = { + {dt::TEXT, 1}, + {dt::OVR_BEGIN, 1}, + {dt::TAG_START, 1}, + {dt::TAG_NAME, 1} + }; + + SplitWords(text, tokens); + ASSERT_EQ(4u, tokens.size()); + EXPECT_EQ(dt::WORD, tokens[0].type); + EXPECT_EQ(dt::TEXT, tokens[1].type); + EXPECT_EQ(dt::TEXT, tokens[2].type); + EXPECT_EQ(dt::WORD, tokens[3].type); +} +