degesch: better & working text wrapping

Now we respect word boundaries.
author: Přemysl Janouch <p.janouch@gmail.com> 2015-04-24 22:34:44 +0200
committer: Přemysl Janouch <p.janouch@gmail.com> 2015-04-24 22:34:44 +0200
commit: d4413627e68325e954406e35c76b51325bb0e5d1 (patch)
tree: 570d246e533ca088f5bfd4a19787174ce822e6b8 /degesch.c
parent: 2a0dcc2addc2c949fa090f1e7f1e0de4f58cccb9 (diff)
download: xK-d4413627e68325e954406e35c76b51325bb0e5d1.tar.gz
xK-d4413627e68325e954406e35c76b51325bb0e5d1.tar.xz
xK-d4413627e68325e954406e35c76b51325bb0e5d1.zip
1 files changed, 49 insertions, 14 deletions
diff --git a/degesch.c b/degesch.c
index 962b587..1808de2 100644
--- a/degesch.c
+++ b/degesch.c
@@ -2398,15 +2398,15 @@ irc_process_message (const struct irc_message *msg,
 
 // --- Message autosplitting magic ---------------------------------------------
 
+// This is the most basic acceptable algorithm; something like ICU with proper
+// locale specification would be needed to make it work better.
+
 static bool
 wrap_text (const char *message,
 	int line_max, struct str_vector *output, struct error **e)
 {
-	// Attempt to split the message if it doesn't completely fit into a single
-	// IRC protocol message while trying not to break UTF-8.  Unicode can still
-	// end up being wrong, though.  As well as any mIRC formatting.
-	//
-	// TODO: at least try to word-wrap if nothing else
+	// Initialize to the first word, even if it's empty
+	const char *word_end = message + strcspn (message, " ");
 
 	for (int message_left = strlen (message); message_left; )
 	{
@@ -2415,12 +2415,46 @@ wrap_text (const char *message,
 
 		int part_left = MIN (line_max, message_left);
 		bool empty = true;
+
+		// First try going word by word
+		const char *word_start;
+		int word_len = word_end - message;
+		while (part_left && word_len <= part_left)
+		{
+			if (word_len)
+			{
+				str_append_data (&m, message, word_len);
+				message += word_len;
+				message_left -= word_len;
+				part_left -= word_len;
+				empty = false;
+			}
+
+			// Find the next word's end
+			word_start = message + strspn (message, " ");
+			word_end = word_start + strcspn (word_start, " ");
+			word_len = word_end - message;
+		}
+
+		if (!empty)
+		{
+			// Discard whitespace between words if split
+			message_left -= word_start - message;
+			message = word_start;
+
+			str_vector_add (output, m.str);
+			str_free (&m);
+			continue;
+		}
+
+		// If not even one word fits, cut the longest valid run of characters.
+		// We never reach the word's end here, so "word_end" stays valid.
 		while (true)
 		{
 			const char *next = utf8_next (message, message_left);
 			hard_assert (next);
 
-			int char_len = message - next;
+			int char_len = next - message;
 			if (char_len > part_left)
 				break;
 
@@ -2428,6 +2462,7 @@ wrap_text (const char *message,
 
 			message += char_len;
 			message_left -= char_len;
+			part_left -= char_len;
 			empty = false;
 		}
 
@@ -2436,14 +2471,14 @@ wrap_text (const char *message,
 
 		str_free (&m);
 
-		if (empty)
-		{
-			// Well, that's just weird
-			error_set (e,
-				"Message splitting was unsuccessful as there was "
-				"too little room for UTF-8 characters");
-			return false;
-		}
+		if (!empty)
+			continue;
+
+		// Well, that's just weird
+		error_set (e,
+			"Message splitting was unsuccessful as there was "
+			"too little room for UTF-8 characters");
+		return false;
 	}
 	return true;
 }
author	Přemysl Janouch <p.janouch@gmail.com>	2015-04-24 22:34:44 +0200
committer	Přemysl Janouch <p.janouch@gmail.com>	2015-04-24 22:34:44 +0200
commit	d4413627e68325e954406e35c76b51325bb0e5d1 (patch)
tree	570d246e533ca088f5bfd4a19787174ce822e6b8 /degesch.c
parent	2a0dcc2addc2c949fa090f1e7f1e0de4f58cccb9 (diff)
download	xK-d4413627e68325e954406e35c76b51325bb0e5d1.tar.gz xK-d4413627e68325e954406e35c76b51325bb0e5d1.tar.xz xK-d4413627e68325e954406e35c76b51325bb0e5d1.zip