Extract word-splitting logic from the syntax highlighter

2025-04-11 22:56:02 +02:00 · 2012-11-06 16:26:00 -08:00 · 2012-11-06 16:26:00 -08:00 · 88fdee726b
commit 88fdee726b
parent 24c21dd425
13 changed files with 482 additions and 230 deletions
--- a/aegisub/libaegisub/ass/dialogue_parser.cpp
+++ b/aegisub/libaegisub/ass/dialogue_parser.cpp
@ -26,49 +26,76 @@
 namespace {

 typedef std::vector<agi::ass::DialogueToken> TokenVec;
-namespace dt = agi::ass::DialogueTokenType;
-namespace ss = agi::ass::SyntaxStyle;
+using namespace agi::ass;
+namespace dt = DialogueTokenType;
+namespace ss = SyntaxStyle;

 class SyntaxHighlighter {
 	TokenVec ranges;
 	std::string const& text;
 	agi::SpellChecker *spellchecker;
-	agi::scoped_holder<iconv_t, int(&)(iconv_t)> utf8_to_utf32;

 	void SetStyling(int len, int type) {
 		if (ranges.size() && ranges.back().type == type)
 			ranges.back().length += len;
 		else
-			ranges.push_back(agi::ass::DialogueToken(type, len));
+			ranges.push_back(DialogueToken(type, len));
 	}

-	void CheckWord(int start, int end) {
-		int len = end - start;
-		if (!len) return;
+public:
+	SyntaxHighlighter(std::string const& text, agi::SpellChecker *spellchecker)
+	: text(text)
+	, spellchecker(spellchecker)
+	{ }

-		if (!spellchecker->CheckWord(text.substr(start, len)))
-			SetStyling(len, ss::SPELLING);
-		else
-			SetStyling(len, ss::NORMAL);
+	TokenVec Highlight(TokenVec const& tokens, bool template_line) {
+		if (tokens.empty()) return ranges;
+
+		size_t pos = 0;
+
+		for (size_t i = 0; i < tokens.size(); ++i) {
+			size_t len = tokens[i].length;
+			switch (tokens[i].type) {
+				case dt::LINE_BREAK: SetStyling(len, ss::LINE_BREAK); break;
+				case dt::ERROR:      SetStyling(len, ss::ERROR);      break;
+				case dt::ARG:        SetStyling(len, ss::PARAMETER);  break;
+				case dt::COMMENT:    SetStyling(len, ss::COMMENT);    break;
+				case dt::WHITESPACE: SetStyling(len, ss::NORMAL);     break;
+				case dt::DRAWING:    SetStyling(len, ss::DRAWING);    break;
+				case dt::TEXT:       SetStyling(len, ss::NORMAL);     break;
+				case dt::TAG_NAME:   SetStyling(len, ss::TAG);        break;
+				case dt::OPEN_PAREN: case dt::CLOSE_PAREN: case dt::ARG_SEP: case dt::TAG_START:
+					SetStyling(len, ss::PUNCTUATION);
+					break;
+				case dt::OVR_BEGIN: case dt::OVR_END:
+					SetStyling(len, ss::OVERRIDE);
+					break;
+				case dt::WORD:
+					if (spellchecker && !spellchecker->CheckWord(text.substr(pos, len)))
+						SetStyling(len, ss::SPELLING);
+					else
+						SetStyling(len, ss::NORMAL);
+					break;
+			}
+
+			pos += len;
+			// karaoke templater
+		}
+
+		return ranges;
 	}
+};

-	int NextChar(int pos, int len, int& char_len) {
-		int chr = 0;
-		char *inptr = const_cast<char *>(&text[pos]);
-		size_t inlen = len;
-		char *outptr = (char *)&chr;
-		size_t outlen = sizeof chr;
+class WordSplitter {
+	std::string const& text;
+	std::vector<DialogueToken> &tokens;
+	agi::scoped_holder<iconv_t, int(&)(iconv_t)> utf8_to_utf32;
+	size_t last_ovr_end;
+	size_t pos;
+	bool in_drawing;

-		iconv(utf8_to_utf32, &inptr, &inlen, &outptr, &outlen);
-		if (outlen != 0)
-			return 0;
-
-		char_len = len - inlen;
-		return chr;
-	}
-
-	void StyleSpellCheck(int pos, int len) {
-		const int delims[] = {
+	bool IsWordSep(int chr) {
+		static const int delims[] = {
 			0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028,
 			0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a,
 			0x003b, 0x003d, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e,
@ -86,38 +113,72 @@ class SyntaxHighlighter {
 			0xff5b, 0xff5d, 0xff5e
 		};

-		int chrlen = 0;
-		int start = pos;
-		for (; len > 0; pos += chrlen, len -= chrlen) {
-			int chr = NextChar(pos, len, chrlen);
-			if (!chr) return;
+		return std::binary_search(std::begin(delims), std::end(delims), chr);
+	}

-			if (std::binary_search(std::begin(delims), std::end(delims), chr)) {
-				CheckWord(start, pos);
-				SetStyling(1, ss::NORMAL);
-				start = pos + 1;
-			}
+	int NextChar(int pos, int len, int& char_len) {
+		int chr = 0;
+		char *inptr = const_cast<char *>(&text[pos]);
+		size_t inlen = len;
+		char *outptr = (char *)&chr;
+		size_t outlen = sizeof chr;
+
+		iconv(utf8_to_utf32, &inptr, &inlen, &outptr, &outlen);
+		if (outlen != 0)
+			return 0;
+
+		char_len = len - inlen;
+		return chr;
+	}
+
+	void SwitchTo(size_t &i, int type, int len) {
+		if (tokens[i].type == type) return;
+
+		if (tokens[i].length == (size_t)len)
+			tokens[i].type = type;
+		else {
+			tokens.insert(tokens.begin() + i + 1, DialogueToken(type, len));
+			tokens[i].length -= len;
+			++i;
+			++last_ovr_end;
+		}
+	}
+
+	void SplitText(size_t &i) {
+		if (in_drawing) {
+			tokens[i].type = dt::DRAWING;
+			return;
 		}

-		CheckWord(start, pos);
+		int chrlen = 0;
+		int len = tokens[i].length;
+		int tpos = pos;
+		for (; len > 0; tpos += chrlen, len -= chrlen) {
+			int chr = NextChar(tpos, len, chrlen);
+			if (!chr) return;
+
+			if (IsWordSep(chr))
+				SwitchTo(i, dt::TEXT, len);
+			else
+				SwitchTo(i, dt::WORD, len);
+		}
 	}

 public:
-	SyntaxHighlighter(std::string const& text, agi::SpellChecker *spellchecker)
+	WordSplitter(std::string const& text, std::vector<DialogueToken> &tokens)
 	: text(text)
-	, spellchecker(spellchecker)
+	, tokens(tokens)
 	, utf8_to_utf32(iconv_open("utf-32le", "utf-8"), iconv_close)
+	, last_ovr_end(0)
+	, pos(0)
+	, in_drawing(false)
 	{ }

-	TokenVec Highlight(TokenVec const& tokens, bool template_line) {
-		if (tokens.empty()) return ranges;
-
-		bool in_drawing = false;
-		size_t pos = 0;
+	void SplitWords() {
+		if (tokens.empty()) return;

 		// VSFilter treats unclosed override blocks as plain text, so pretend
 		// all tokens after the last override block are TEXT
-		size_t last_ovr_end = 0;
 		for (size_t i = tokens.size(); i > 0; --i) {
 			if (tokens[i - 1].type == dt::OVR_END) {
 				last_ovr_end = i - 1;
@ -127,30 +188,14 @@ public:

 		for (size_t i = 0; i < tokens.size(); ++i) {
 			size_t len = tokens[i].length;
-			switch (i > last_ovr_end ? dt::TEXT : tokens[i].type) {
-				case dt::LINE_BREAK: SetStyling(len, ss::LINE_BREAK); break;
-				case dt::ERROR:      SetStyling(len, ss::ERROR); break;
-				case dt::ARG:        SetStyling(len, ss::PARAMETER); break;
-				case dt::COMMENT:    SetStyling(len, ss::COMMENT); break;
-				case dt::WHITESPACE: SetStyling(len, ss::NORMAL); break;
-				case dt::OPEN_PAREN: case dt::CLOSE_PAREN: case dt::ARG_SEP: case dt::TAG_START:
-					SetStyling(len, ss::PUNCTUATION);
-					break;
-				case dt::OVR_BEGIN: case dt::OVR_END:
-					SetStyling(len, ss::OVERRIDE);
-					break;
-
-				case dt::TEXT:
-					if (in_drawing)
-						SetStyling(len, ss::DRAWING);
-					else if (spellchecker)
-						StyleSpellCheck(pos, len);
-					else
-						SetStyling(len, ss::NORMAL);
-					break;
-
+			switch (tokens[i].type) {
+				case dt::LINE_BREAK: break;
+				case dt::TEXT: SplitText(i); break;
 				case dt::TAG_NAME:
-					SetStyling(len, ss::TAG);
+					if (i > last_ovr_end) {
+						SplitText(i);
+						break;
+					}

 					if (len != 1 || i + 1 >= tokens.size() || text[pos] != 'p')
 						break;
@ -170,13 +215,14 @@ public:
 							break;
 					}
 					break;
+				default:
+					if (i > last_ovr_end)
+						SplitText(i);
+					break;
 			}

 			pos += len;
-			// karaoke templater
 		}
-
-		return ranges;
 	}
 };
 }
@ -188,5 +234,9 @@ std::vector<DialogueToken> SyntaxHighlight(std::string const& text, std::vector<
 	return SyntaxHighlighter(text, spellchecker).Highlight(tokens, template_line);
 }

+/// Public entry point for word splitting: builds a temporary WordSplitter
+/// bound to str and tokens and runs it. WordSplitter holds tokens by
+/// reference, so the caller's vector is modified in place.
+void SplitWords(std::string const& str, std::vector<DialogueToken> &tokens) {
+	WordSplitter(str, tokens).SplitWords();
+}
+
 }
 }
--- a/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h
+++ b/aegisub/libaegisub/include/libaegisub/ass/dialogue_parser.h
@ -27,6 +27,7 @@ namespace agi {
 		namespace DialogueTokenType {
 			enum {
 				TEXT = 1000,
+				WORD,
 				LINE_BREAK,
 				OVR_BEGIN,
 				OVR_END,
@ -38,7 +39,8 @@ namespace agi {
 				ARG,
 				ERROR,
 				COMMENT,
-				WHITESPACE
+				WHITESPACE,
+				DRAWING
 			};
 		}

@ -66,8 +68,13 @@ namespace agi {
 			DialogueToken(int type, size_t length) : type(type), length(length) { }
 		};

+		/// Tokenize the passed string as the body of a dialogue line
 		std::vector<DialogueToken> TokenizeDialogueBody(std::string const& str);

+		/// Split the words in the TEXT tokens of the lexed line into their
+		/// own tokens and convert the body of drawings to DRAWING tokens
+		void SplitWords(std::string const& str, std::vector<DialogueToken> &tokens);
+
 		std::vector<DialogueToken> SyntaxHighlight(std::string const& text, std::vector<DialogueToken> const& tokens, bool template_line, SpellChecker *spellchecker);
 	}
 }
--- a/aegisub/src/dialog_spellchecker.cpp
+++ b/aegisub/src/dialog_spellchecker.cpp
@ -47,6 +47,7 @@
 #include "subs_edit_ctrl.h"
 #include "utils.h"

+#include <libaegisub/ass/dialogue_parser.h>
 #include <libaegisub/exception.h>
 #include <libaegisub/spellchecker.h>

@ -168,7 +169,7 @@ void DialogSpellChecker::OnReplace(wxCommandEvent&) {
 }

 void DialogSpellChecker::OnReplaceAll(wxCommandEvent&) {
-	auto_replace[orig_word->GetValue()] = replace_word->GetValue();
+	auto_replace[from_wx(orig_word->GetValue())] = from_wx(replace_word->GetValue());

 	Replace();
 	FindNext();
@ -179,7 +180,7 @@ void DialogSpellChecker::OnIgnore(wxCommandEvent&) {
 }

 void DialogSpellChecker::OnIgnoreAll(wxCommandEvent&) {
-	auto_ignore.insert(orig_word->GetValue());
+	auto_ignore.emplace(from_wx(orig_word->GetValue()));
 	FindNext();
 }

@ -247,19 +248,22 @@ bool DialogSpellChecker::FindNext() {
 bool DialogSpellChecker::CheckLine(AssDialogue *active_line, int start_pos, int *commit_id) {
 	if (active_line->Comment && skip_comments->GetValue()) return false;

-	IntPairVector results;
-	GetWordBoundaries(active_line->Text, results);
+	std::string text = from_wx(active_line->Text);
+	auto tokens = agi::ass::TokenizeDialogueBody(text);
+	agi::ass::SplitWords(text, tokens);

-	int shift = 0;
-	for (auto const& result : results) {
-		word_start = result.first + shift;
+	word_start = 0;
+	for (auto const& tok : tokens) {
+		word_start += tok.length;
+		if (tok.type != agi::ass::DialogueTokenType::WORD) continue;
 		if (word_start < start_pos) continue;
-		word_end = result.second + shift;
-		wxString word = active_line->Text.Mid(word_start, word_end - word_start);

-		if (auto_ignore.count(word) || spellchecker->CheckWord(from_wx(word))) continue;
+		word_len = tok.length;
+		std::string word = text.substr(word_start, word_len);

-		std::map<wxString, wxString>::const_iterator auto_rep = auto_replace.find(word);
+		if (auto_ignore.count(word) || spellchecker->CheckWord(word)) continue;
+
+		auto auto_rep = auto_replace.find(word);
 		if (auto_rep == auto_replace.end()) {
 #ifdef __WXGTK__
 			// http://trac.wxwidgets.org/ticket/14369
@ -274,9 +278,10 @@ bool DialogSpellChecker::CheckLine(AssDialogue *active_line, int start_pos, int
 			return true;
 		}

-		active_line->Text = active_line->Text.Left(word_start) + auto_rep->second + active_line->Text.Mid(word_end);
+		text.replace(word_start, word_len, auto_rep->second);
+		active_line->Text = from_wx(text);
 		*commit_id = context->ass->Commit(_("spell check replace"), AssFile::COMMIT_DIAG_TEXT, *commit_id);
-		shift += auto_rep->second.size() - auto_rep->first.size();
+		word_start += auto_rep->second.size() - auto_rep->first.size();
 	}
 	return false;
 }
@ -285,23 +290,23 @@ void DialogSpellChecker::Replace() {
 	AssDialogue *active_line = context->selectionController->GetActiveLine();

 	// Only replace if the user hasn't changed the selection to something else
-	if (active_line->Text.Mid(word_start, word_end - word_start) == orig_word->GetValue()) {
-		active_line->Text = active_line->Text.Left(word_start) + replace_word->GetValue() + active_line->Text.Mid(word_end);
+	if (active_line->Text.Mid(word_start, word_len) == orig_word->GetValue()) {
+		active_line->Text.replace(word_start, word_len, replace_word->GetValue());
 		context->ass->Commit(_("spell check replace"), AssFile::COMMIT_DIAG_TEXT);
 		context->textSelectionController->SetInsertionPoint(word_start + replace_word->GetValue().size());
 	}
 }

-void DialogSpellChecker::SetWord(wxString const& word) {
-	orig_word->SetValue(word);
+void DialogSpellChecker::SetWord(std::string const& word) {
+	orig_word->SetValue(to_wx(word));

-	wxArrayString suggestions = to_wx(spellchecker->GetSuggestions(from_wx(word)));
-	replace_word->SetValue(suggestions.size() ? suggestions[0] : word);
+	wxArrayString suggestions = to_wx(spellchecker->GetSuggestions(word));
+	replace_word->SetValue(suggestions.size() ? suggestions[0] : to_wx(word));
 	suggest_list->Clear();
 	suggest_list->Append(suggestions);

-	context->textSelectionController->SetSelection(word_start, word_end);
-	context->textSelectionController->SetInsertionPoint(word_end);
+	context->textSelectionController->SetSelection(word_start, word_start + word_len);
+	context->textSelectionController->SetInsertionPoint(word_start + word_len);

-	add_button->Enable(spellchecker->CanAddWord(from_wx(word)));
+	add_button->Enable(spellchecker->CanAddWord(word));
 }
--- a/aegisub/src/dialog_spellchecker.h
+++ b/aegisub/src/dialog_spellchecker.h
@ -48,16 +48,16 @@ class DialogSpellChecker : public wxDialog {
 	agi::scoped_ptr<agi::SpellChecker> spellchecker; ///< The spellchecking engine

 	/// Words which the user has indicated should always be corrected
-	std::map<wxString,wxString> auto_replace;
+	std::map<std::string, std::string> auto_replace;

 	/// Words which the user has temporarily added to the dictionary
-	std::set<wxString> auto_ignore;
+	std::set<std::string> auto_ignore;

 	/// Dictionaries available
 	wxArrayString dictionary_lang_codes;

 	int word_start; ///< Start index of the current misspelled word
-	int word_end;   ///< End index of the current misspelled word
+	int word_len;   ///< Length of the current misspelled word

 	wxTextCtrl *orig_word;    ///< The word being corrected
 	wxTextCtrl *replace_word; ///< The replacement that will be used if "Replace" is clicked
@ -83,7 +83,7 @@ class DialogSpellChecker : public wxDialog {
 	bool CheckLine(AssDialogue *active_line, int start_pos, int *commit_id);

 	/// Set the current word to be corrected
-	void SetWord(wxString const& word);
+	void SetWord(std::string const& word);
 	/// Correct the currently selected word
 	void Replace();

--- a/aegisub/src/scintilla_text_ctrl.cpp
+++ b/aegisub/src/scintilla_text_ctrl.cpp
@ -82,32 +82,6 @@ void ScintillaTextCtrl::SetUnicodeStyling(int start,int length,int style) {
 	SetStyling(len,style);
 }

-/// @brief Get boundaries of word at position
-void ScintillaTextCtrl::GetBoundsOfWordAtPosition(int pos,int &start,int &end) {
-	IntPairVector results;
-	GetWordBoundaries(GetText(), results);
-
-	// Get boundaries
-	for (auto const& result : results) {
-		if (result.first <= pos && result.second >= pos) {
-			start = result.first;
-			end = result.second;
-			return;
-		}
-	}
-
-	// Word not found
-	start = 0;
-	end = 0;
-}
-
-/// @brief Get word at specified position
-wxString ScintillaTextCtrl::GetWordAtPosition(int pos) {
-	int start,end;
-	GetBoundsOfWordAtPosition(pos, start, end);
-	return GetText().Mid(start, end - start);
-}
-
 /// @brief Set selection, unicode-aware
 void ScintillaTextCtrl::SetSelectionU(int start, int end) {
 	SetSelection(GetUnicodePosition(start),GetUnicodePosition(end));
--- a/aegisub/src/scintilla_text_ctrl.h
+++ b/aegisub/src/scintilla_text_ctrl.h
@ -34,6 +34,8 @@

 #ifndef AGI_PRE
 #include <wx/stc/stc.h>
+
+#include <string>
 #endif

 /// DOCME
@ -46,8 +48,6 @@ class ScintillaTextCtrl : public wxStyledTextCtrl {

 	void OnMouseWheel(wxMouseEvent& evt);
 public:
-	wxString GetWordAtPosition(int pos);
-	void GetBoundsOfWordAtPosition(int pos,int &start,int &end);
 	int GetUnicodePosition(int pos);
 	int GetReverseUnicodePosition(int pos);

--- a/aegisub/src/subs_edit_ctrl.cpp
+++ b/aegisub/src/subs_edit_ctrl.cpp
@ -213,6 +213,7 @@ void SubsTextEditCtrl::UpdateStyle() {
 		line_text = move(text);
 	}
 	tokenized_line = agi::ass::TokenizeDialogueBody(line_text);
+	agi::ass::SplitWords(line_text, tokenized_line);

 	cursor_pos = -1;
 	UpdateCallTip();
@ -298,15 +299,13 @@ void SubsTextEditCtrl::Paste() {
 void SubsTextEditCtrl::OnContextMenu(wxContextMenuEvent &event) {
 	wxPoint pos = event.GetPosition();
 	int activePos;
-	if (pos == wxDefaultPosition) {
+	if (pos == wxDefaultPosition)
 		activePos = GetCurrentPos();
-	}
-	else {
+	else
 		activePos = PositionFromPoint(ScreenToClient(pos));
-	}

-	currentWordPos = GetReverseUnicodePosition(activePos);
-	currentWord = from_wx(GetWordAtPosition(currentWordPos));
+	currentWordPos = GetBoundsOfWordAtPosition(activePos);
+	currentWord = line_text.substr(currentWordPos.first, currentWordPos.second);

 	wxMenu menu;
 	if (!currentWord.empty()) {
@ -431,27 +430,22 @@ void SubsTextEditCtrl::OnAddToDictionary(wxCommandEvent &) {
 void SubsTextEditCtrl::OnUseSuggestion(wxCommandEvent &event) {
 	std::string suggestion;
 	int sugIdx = event.GetId() - EDIT_MENU_THESAURUS_SUGS;
-	if (sugIdx >= 0) {
-		suggestion = lagi_wxString(thesSugs[sugIdx]);
-	}
-	else {
+	if (sugIdx >= 0)
+		suggestion = from_wx(thesSugs[sugIdx]);
+	else
 		suggestion = sugs[event.GetId() - EDIT_MENU_SUGGESTIONS];
-	}

 	// Strip suggestion of parenthesis
 	size_t pos = suggestion.find("(");
 	if (pos != suggestion.npos)
 		suggestion.resize(pos - 1);

-	// Get boundaries of text being replaced
-	int start, end;
-	GetBoundsOfWordAtPosition(currentWordPos, start, end);
+	// line_text needs to get cleared before SetTextRaw to ensure it gets reparsed
+	std::string new_text;
+	swap(line_text, new_text);
+	SetTextRaw(new_text.replace(currentWordPos.first, currentWordPos.second, suggestion).c_str());

-	wxString text = GetText();
-	SetText(text.Left(std::max(0, start)) + to_wx(suggestion) + text.Mid(end));
-
-	// Set selection
-	SetSelectionU(start, start+suggestion.size());
+	SetSelection(currentWordPos.first, currentWordPos.first + suggestion.size());
 	SetFocus();
 }

@ -480,3 +474,17 @@ void SubsTextEditCtrl::OnSetThesLanguage(wxCommandEvent &event) {

 	UpdateStyle();
 }
+
+/// Find the WORD token of the tokenized line which contains a position
+/// @param pos Offset into the line, in the same units as the token lengths
+/// @return (start offset, length) of the word containing pos, or (0, 0) if
+///         pos is not inside a WORD token
+std::pair<int, int> SubsTextEditCtrl::GetBoundsOfWordAtPosition(int pos) {
+	int len = 0;
+	for (auto const& tok : tokenized_line) {
+		// len is the start offset of tok, so tok covers [len, len + length);
+		// the first token whose end lies past pos is the one containing it
+		if (len + (int)tok.length > pos) {
+			if (tok.type == agi::ass::DialogueTokenType::WORD)
+				return std::make_pair(len, tok.length);
+			return std::make_pair(0, 0);
+		}
+		len += tok.length;
+	}
+
+	return std::make_pair(0, 0);
+}
--- a/aegisub/src/subs_edit_ctrl.h
+++ b/aegisub/src/subs_edit_ctrl.h
@ -68,7 +68,7 @@ class SubsTextEditCtrl : public ScintillaTextCtrl {
 	std::string currentWord;

 	/// The beginning of the word right-clicked on, for spellchecker replacing
-	int currentWordPos;
+	std::pair<int, int> currentWordPos;

 	/// Spellchecker suggestions for the last right-clicked word
 	std::vector<std::string> sugs;
@ -129,5 +129,7 @@ public:
 	void SetTextTo(wxString const& text);
 	void Paste();

+	std::pair<int, int> GetBoundsOfWordAtPosition(int pos);
+
 	DECLARE_EVENT_TABLE()
 };
--- a/aegisub/src/utils.cpp
+++ b/aegisub/src/utils.cpp
@ -118,75 +118,6 @@ int SmallestPowerOf2(int x) {
 	return x;
 }

-void GetWordBoundaries(wxString const& text, IntPairVector &results, int start, int end) {
-	int depth = 0;
-	bool in_draw_mode = false;
-	if (end < 0) end = text.size();
-
-	// Delimiters
-	const wxUniChar delims[] = {
-		0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028,
-		0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 0x003a,
-		0x003b, 0x003d, 0x003f, 0x0040, 0x005b, 0x005c, 0x005d, 0x005e,
-		0x005f, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e, 0x00a1, 0x00a2,
-		0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00aa, 0x00ab,
-		0x00b0, 0x00b6, 0x00b7, 0x00ba, 0x00bb, 0x00bf, 0x02dc, 0x0e3f,
-		0x2010, 0x2013, 0x2014, 0x2015, 0x2018, 0x2019, 0x201c, 0x201d,
-		0x2020, 0x2021, 0x2022, 0x2025, 0x2026, 0x2026, 0x2030, 0x2031,
-		0x2032, 0x203b, 0x203b, 0x203d, 0x2042, 0x2044, 0x20a6, 0x20a9,
-		0x20aa, 0x20ac, 0x20ad, 0x2116, 0x2234, 0x2235, 0x2420, 0x2422,
-		0x2423, 0x2506, 0x25ca, 0x2605, 0x261e, 0x2e2e, 0x3000, 0x3001,
-		0x3002, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 0x300d, 0x300e,
-		0x300f, 0x3010, 0x3011, 0x3014, 0x3015, 0x3016, 0x3017, 0x3018,
-		0x3019, 0x301a, 0x301b, 0x301c, 0x3030, 0x303d, 0x30fb, 0xff0a,
-		0xff5b, 0xff5d, 0xff5e
-	};
-
-	for (int i = start; i < end + 1; ++i) {
-		// Current character
-		wxUniChar cur = i < end ? text[i] : wxUniChar('.');
-
-		// Increase depth
-		if (cur == '{') {
-			depth++;
-			if (depth == 1 && start != i && !in_draw_mode)
-				results.push_back(std::make_pair(start, i));
-		}
-		// Decrease depth
-		else if (cur == '}') {
-			depth--;
-			start = i + 1;
-		}
-		else if (depth > 0) {
-			// Check for draw mode
-			if (cur == '\\' && i + 1 < end && text[i + 1] == 'p') {
-				i += 2;
-
-				// Eat leading zeros
-				while (i < end && text[i] == '0') ++i;
-
-				in_draw_mode = i < end && text[i] >= '0' && text[i] <= '9';
-				if (!in_draw_mode) --i;
-			}
-		}
-		else if (!in_draw_mode) {
-			// Check if it is \n or \N
-			if (cur == '\\' && i < end-1 && (text[i+1] == 'N' || text[i+1] == 'n' || text[i+1] == 'h')) {
-				if (start != i)
-					results.push_back(std::make_pair(start, i));
-				start = i + 2;
-				i++;
-			}
-			// Check for standard delimiters
-			else if (std::binary_search(delims, delims + sizeof(delims) / sizeof(delims[0]), cur)) {
-				if (start != i)
-					results.push_back(std::make_pair(start, i));
-				start = i + 1;
-			}
-		}
-	}
-}
-
 bool IsWhitespace(wchar_t c)
 {
 	const wchar_t whitespaces[] = {
--- a/aegisub/src/utils.h
+++ b/aegisub/src/utils.h
@ -49,8 +49,6 @@
 class wxMouseEvent;
 class wxWindow;

-typedef std::vector<std::pair<int,int> > IntPairVector;
-
 /// @brief Make a path relative to reference
 wxString MakeRelativePath(wxString path,wxString reference);
 /// @brief Extract original path from relative
@ -64,16 +62,6 @@ wxString PrettySize(int bytes);
 /// Algorithm from http://bob.allegronetwork.com/prog/tricks.html
 int SmallestPowerOf2(int x);

-/// Get the indices in text which are the beginnings of words
-/// @param text Text to split into words
-/// @param[out] results Vector of indices which are the beginnings of words
-/// @param start First index in text to check
-/// @param end Last index in text to check, or -1 for end
-///
-/// This is ASS-specific and not a general purpose word boundary finder; words
-/// within override blocks or drawing blocks are ignored
-void GetWordBoundaries(wxString const& text, IntPairVector &results, int start=0, int end=-1);
-
 /// Check if wchar 'c' is a whitespace character
 bool IsWhitespace(wchar_t c);

--- a/aegisub/tests/Makefile
+++ b/aegisub/tests/Makefile
@ -25,14 +25,15 @@ SRC = \
 		libaegisub_iconv.cpp \
 		libaegisub_keyframe.cpp \
 		libaegisub_line_iterator.cpp \
+		libaegisub_line_wrap.cpp \
 		libaegisub_option.cpp \
 		libaegisub_mru.cpp \
 		libaegisub_signals.cpp \
 		libaegisub_thesaurus.cpp \
 		libaegisub_util.cpp \
 		libaegisub_vfr.cpp \
-		libaegisub_line_wrap.cpp
-		
+		libaegisub_word_split.cpp
+
 HEADER = \
 	*.h

--- a/aegisub/tests/libaegisub_syntax_highlight.cpp
+++ b/aegisub/tests/libaegisub_syntax_highlight.cpp
@ -0,0 +1,151 @@
+// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#include <libaegisub/ass/dialogue_parser.h>
+#include <libaegisub/spellchecker.h>
+
+#include "main.h"
+
+/// Spell checker stub for the highlighter tests: it rejects exactly one
+/// word, the literal "incorrect", and accepts everything else. The other
+/// members are no-ops returning empty or false values.
+class MockSpellChecker : public agi::SpellChecker {
+	void AddWord(std::string const&) { }
+	bool CanAddWord(std::string const&) { return false; }
+	std::vector<std::string> GetSuggestions(std::string const&) { return std::vector<std::string>(); }
+	std::vector<std::string> GetLanguageList() { return std::vector<std::string>(); }
+	// Only the exact string "incorrect" is reported as misspelled
+	bool CheckWord(std::string const& word) { return word != "incorrect"; }
+};
+
+using namespace agi::ass;
+namespace dt = DialogueTokenType;
+namespace ss = SyntaxStyle;
+
+class lagi_syntax : public libagi { };
+
+TEST(lagi_syntax, empty) {
+	std::string text;
+	std::vector<DialogueToken> tokens;
+
+	EXPECT_TRUE(SyntaxHighlight(text, tokens, false, 0).empty());
+
+	tokens.emplace_back(dt::TEXT, 0);
+	auto syntax = SyntaxHighlight(text, tokens, false, 0);
+	EXPECT_EQ(1u, syntax.size());
+	EXPECT_EQ(ss::NORMAL, syntax[0].type);
+}
+
+/// Tokenize arg1, split its text into words and syntax-highlight the result
+/// with a MockSpellChecker, then run the statements passed as the remaining
+/// arguments (a sequence of expect_style checks). The trailing EXPECT_EQ
+/// verifies that those checks consumed every style range that was produced.
+#define tok_str(arg1, ...) do { \
+	MockSpellChecker spellchecker; \
+	std::string str = arg1; \
+	std::vector<DialogueToken> tok = TokenizeDialogueBody(str); \
+	SplitWords(str, tok); \
+	std::vector<DialogueToken> styles = SyntaxHighlight(str, tok, false, &spellchecker); \
+	size_t token_index = 0; \
+	__VA_ARGS__ \
+	EXPECT_EQ(token_index, styles.size()); \
+} while(false)
+
+/// Check that the next unchecked entry of `styles` has the expected type and
+/// length, then advance `token_index`. Both names are the locals declared by
+/// tok_str, so this is only usable inside its argument list. The index is
+/// bounds-checked first so that a too-short result fails the test instead of
+/// reading past the end of the vector.
+#define expect_style(expected_type, expected_len) do { \
+	EXPECT_LT(token_index, styles.size()); \
+	if (token_index < styles.size()) { \
+		EXPECT_EQ(expected_type, styles[token_index].type); \
+		EXPECT_EQ(expected_len, styles[token_index].length); \
+		++token_index; \
+	} \
+} while(false)
+
+TEST(lagi_syntax, spellcheck) {
+	tok_str("correct incorrect correct",
+		expect_style(ss::NORMAL, 8u);
+		expect_style(ss::SPELLING, 9u);
+		expect_style(ss::NORMAL, 8u);
+	);
+}
+
+TEST(lagi_syntax, drawing) {
+	tok_str("incorrect{\\p1}m 10 10{\\p}correct",
+		expect_style(ss::SPELLING, 9u);
+		expect_style(ss::OVERRIDE, 1u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::TAG, 1u);
+		expect_style(ss::PARAMETER, 1u);
+		expect_style(ss::OVERRIDE, 1u);
+		expect_style(ss::DRAWING, 7u);
+		expect_style(ss::OVERRIDE, 1u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::TAG, 1u);
+		expect_style(ss::OVERRIDE, 1u);
+		expect_style(ss::NORMAL, 7u);
+	);
+}
+
+TEST(lagi_syntax, transform) {
+	tok_str("{\\t(0, 0, \\clip(0,0,10,10)}clipped text",
+		expect_style(ss::OVERRIDE, 1u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::TAG, 1u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::PARAMETER, 1u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::NORMAL, 1u);
+		expect_style(ss::PARAMETER, 1u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::NORMAL, 1u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::TAG, 4u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::PARAMETER, 1u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::PARAMETER, 1u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::PARAMETER, 2u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::PARAMETER, 2u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::OVERRIDE, 1u);
+		expect_style(ss::NORMAL, 12u);
+	);
+}
+
+TEST(lagi_syntax, unclosed) {
+	tok_str("{\\incorrect}{\\incorrect",
+		expect_style(ss::OVERRIDE, 1u);
+		expect_style(ss::PUNCTUATION, 1u);
+		expect_style(ss::TAG, 9u);
+		expect_style(ss::OVERRIDE, 1u);
+		expect_style(ss::NORMAL, 2u);
+		expect_style(ss::SPELLING, 9u);
+	);
+}
+
+TEST(lagi_syntax, comment) {
+	tok_str("abc{def}ghi",
+		expect_style(ss::NORMAL, 3u);
+		expect_style(ss::OVERRIDE, 1u);
+		expect_style(ss::COMMENT, 3u);
+		expect_style(ss::OVERRIDE, 1u);
+		expect_style(ss::NORMAL, 3u);
+	);
+}
+
+TEST(lagi_syntax, linebreak) {
+	tok_str("a\\Nb\\nc\\hd\\N\\N",
+		expect_style(ss::NORMAL, 1u);
+		expect_style(ss::LINE_BREAK, 2u);
+		expect_style(ss::NORMAL, 1u);
+		expect_style(ss::LINE_BREAK, 2u);
+		expect_style(ss::NORMAL, 1u);
+		expect_style(ss::LINE_BREAK, 2u);
+		expect_style(ss::NORMAL, 1u);
+		expect_style(ss::LINE_BREAK, 4u);
+	);
+}
--- a/aegisub/tests/libaegisub_word_split.cpp
+++ b/aegisub/tests/libaegisub_word_split.cpp
@ -0,0 +1,135 @@
+// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#include "main.h"
+
+#include <libaegisub/ass/dialogue_parser.h>
+
+class lagi_word_split : public libagi { };
+
+using namespace agi::ass;
+namespace dt = DialogueTokenType;
+
+TEST(lagi_word_split, empty) {
+	std::string text;
+	std::vector<DialogueToken> tokens;
+
+	SplitWords(text, tokens);
+	EXPECT_TRUE(tokens.empty());
+
+	tokens.emplace_back(0, 0);
+	SplitWords(text, tokens);
+	EXPECT_EQ(1u, tokens.size());
+}
+
+TEST(lagi_word_split, one_word) {
+	std::string text = "abc";
+	std::vector<DialogueToken> tokens = {{dt::TEXT, 3}};
+
+	SplitWords(text, tokens);
+	ASSERT_EQ(1u, tokens.size());
+	EXPECT_EQ(dt::WORD, tokens[0].type);
+}
+
+TEST(lagi_word_split, two_words_space) {
+	std::string text = "abc def";
+	std::vector<DialogueToken> tokens = {{dt::TEXT, 7}};
+
+	SplitWords(text, tokens);
+	ASSERT_EQ(3u, tokens.size());
+	EXPECT_EQ(dt::WORD, tokens[0].type);
+	EXPECT_EQ(3u, tokens[0].length);
+	EXPECT_EQ(dt::TEXT, tokens[1].type);
+	EXPECT_EQ(1u, tokens[1].length);
+	EXPECT_EQ(dt::WORD, tokens[2].type);
+	EXPECT_EQ(3u, tokens[2].length);
+}
+
+TEST(lagi_word_split, two_words_newline) {
+	std::string text = "abc\\Ndef";
+	std::vector<DialogueToken> tokens = {
+		{dt::TEXT, 3},
+		{dt::LINE_BREAK, 2},
+		{dt::TEXT, 3}
+	};
+
+	SplitWords(text, tokens);
+	ASSERT_EQ(3u, tokens.size());
+	EXPECT_EQ(dt::WORD, tokens[0].type);
+	EXPECT_EQ(3u, tokens[0].length);
+	EXPECT_EQ(dt::LINE_BREAK, tokens[1].type);
+	EXPECT_EQ(2u, tokens[1].length);
+	EXPECT_EQ(dt::WORD, tokens[2].type);
+	EXPECT_EQ(3u, tokens[2].length);
+}
+
+TEST(lagi_word_split, two_words_unicode) {
+	std::string text = u8"abc\u300adef";
+	std::vector<DialogueToken> tokens = {{dt::TEXT, 9}};
+
+	SplitWords(text, tokens);
+	ASSERT_EQ(3u, tokens.size());
+	EXPECT_EQ(dt::WORD, tokens[0].type);
+	EXPECT_EQ(3u, tokens[0].length);
+	EXPECT_EQ(dt::TEXT, tokens[1].type);
+	EXPECT_EQ(3u, tokens[1].length);
+	EXPECT_EQ(dt::WORD, tokens[2].type);
+	EXPECT_EQ(3u, tokens[2].length);
+}
+
+// Text inside a \p1 ... \p0 drawing block must become a DRAWING token rather
+// than being split into words, while text outside it is split as usual
+TEST(lagi_word_split, drawing) {
+	std::string text = "a b{\\p1}m 10{\\p0}c";
+	std::vector<DialogueToken> tokens = {
+		{dt::TEXT, 3},
+		{dt::OVR_BEGIN, 1},
+		{dt::TAG_START, 1},
+		{dt::TAG_NAME, 1},
+		{dt::ARG, 1},
+		{dt::OVR_END, 1},
+		{dt::TEXT, 4},
+		{dt::OVR_BEGIN, 1},
+		{dt::TAG_START, 1},
+		{dt::TAG_NAME, 1},
+		{dt::ARG, 1},
+		{dt::OVR_END, 1},
+		{dt::TEXT, 1}
+	};
+
+	SplitWords(text, tokens);
+
+	// "a b" is split into WORD, TEXT, WORD, which adds two tokens to the
+	// original thirteen and shifts the index of every later token up by two
+	ASSERT_EQ(15u, tokens.size());
+	EXPECT_EQ(dt::WORD, tokens[0].type);
+	EXPECT_EQ(dt::WORD, tokens[2].type);
+	EXPECT_EQ(dt::WORD, tokens[14].type);
+
+	// "m 10" (originally token 6) is the body of the drawing
+	EXPECT_EQ(dt::DRAWING, tokens[8].type);
+}
+
+TEST(lagi_word_split, unclosed_ovr) {
+	std::string text = "a{\\b";
+	std::vector<DialogueToken> tokens = {
+		{dt::TEXT, 1},
+		{dt::OVR_BEGIN, 1},
+		{dt::TAG_START, 1},
+		{dt::TAG_NAME, 1}
+	};
+
+	SplitWords(text, tokens);
+	ASSERT_EQ(4u, tokens.size());
+	EXPECT_EQ(dt::WORD, tokens[0].type);
+	EXPECT_EQ(dt::TEXT, tokens[1].type);
+	EXPECT_EQ(dt::TEXT, tokens[2].type);
+	EXPECT_EQ(dt::WORD, tokens[3].type);
+}
+