From b8dbc70a9c1b9160c07e696b3a64655abc6b7d9d Mon Sep 17 00:00:00 2001
From: Přemysl Eric Janouch <p@janouch.name>
Date: Sat, 28 Aug 2021 14:44:38 +0200
Subject: xC: respect text formatting when autosplitting

---
 NEWS |   5 +++
 xC.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 133 insertions(+), 29 deletions(-)

diff --git a/NEWS b/NEWS
index a037978..9edef71 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,8 @@
+1.4.0 (xxxx-xx-xx)
+
+ * xC: made message autosplitting respect text formatting
+
+
 1.3.0 (2021-08-07) "New World Order"
 
  * xC: made nick autocompletion offer recent speakers first
diff --git a/xC.c b/xC.c
index 438cf59..2b2bf73 100644
--- a/xC.c
+++ b/xC.c
@@ -2797,7 +2797,8 @@ enum
 	TEXT_UNDERLINE   = 1 << 2,
 	TEXT_INVERSE     = 1 << 3,
 	TEXT_BLINK       = 1 << 4,
-	TEXT_CROSSED_OUT = 1 << 5
+	TEXT_CROSSED_OUT = 1 << 5,
+	TEXT_MONOSPACE   = 1 << 6
 };
 
 struct attr_printer
@@ -8227,12 +8228,99 @@ irc_process_message (const struct irc_message *msg, struct server *s)
 
 // --- Message autosplitting magic ---------------------------------------------
 
-// This is the most basic acceptable algorithm; something like ICU with proper
+// This is a rather basic algorithm; something like ICU with proper
 // locale specification would be needed to make it work better.
 
+struct irc_char_attrs
+{
+	uint8_t fg, bg;                     ///< {Fore,back}ground colour, 99=none
+	uint8_t attributes;                 ///< TEXT_* flags, except TEXT_BLINK
+	uint8_t starts_at_boundary;         ///< Can split before this byte?
+};
+
+static void
+irc_serialize_char_attrs (const struct irc_char_attrs *attrs, struct str *out)
+{
+	soft_assert (attrs->fg < 100 && attrs->bg < 100);
+
+	if (attrs->fg != 99 || attrs->bg != 99)
+	{
+		str_append_printf (out, "\x03%u", attrs->fg);
+		if (attrs->bg != 99)
+			str_append_printf (out, ",%02u", attrs->bg);
+	}
+	if (attrs->attributes & TEXT_BOLD)        str_append_c (out, '\x02');
+	if (attrs->attributes & TEXT_ITALIC)      str_append_c (out, '\x1d');
+	if (attrs->attributes & TEXT_UNDERLINE)   str_append_c (out, '\x1f');
+	if (attrs->attributes & TEXT_INVERSE)     str_append_c (out, '\x16');
+	if (attrs->attributes & TEXT_CROSSED_OUT) str_append_c (out, '\x1e');
+	if (attrs->attributes & TEXT_MONOSPACE)   str_append_c (out, '\x11');
+}
+
+static const char *
+irc_analyze_mirc_color (const char *s, uint8_t *fg, uint8_t *bg)
+{
+	if (!isdigit_ascii (*s))
+	{
+		*fg = *bg = 99;
+		return s;
+	}
+
+	*fg = *s++ - '0';
+	if (isdigit_ascii (*s))
+		*fg = *fg * 10 + (*s++ - '0');
+
+	if (*s != ',' || !isdigit_ascii (s[1]))
+		return s;
+	s++;
+
+	*bg = *s++ - '0';
+	if (isdigit_ascii (*s))
+		*bg = *bg * 10 + (*s++ - '0');
+	return s;
+}
+
+// The text must be NUL-terminated: colour parsing isn't bounded by len
+// TODO: try to deduplicate analogous code in formatter_parse_mirc()
+static struct irc_char_attrs *
+irc_analyze_text (const char *text, size_t len)
+{
+	struct irc_char_attrs *attrs = xcalloc (len, sizeof *attrs),
+		blank = { .fg = 99, .bg = 99, .starts_at_boundary = true },
+		next = blank, cur = next;
+
+	for (size_t i = 0; i != len; cur = next)
+	{
+		const char *start = text;
+		hard_assert (utf8_decode (&text, len - i) >= 0);
+		switch (*start)
+		{
+		case '\x02': next.attributes ^= TEXT_BOLD;        break;
+		case '\x11': next.attributes ^= TEXT_MONOSPACE;   break;
+		case '\x1d': next.attributes ^= TEXT_ITALIC;      break;
+		case '\x1e': next.attributes ^= TEXT_CROSSED_OUT; break;
+		case '\x1f': next.attributes ^= TEXT_UNDERLINE;   break;
+		case '\x16': next.attributes ^= TEXT_INVERSE;     break;
+
+		case '\x03':
+			text = irc_analyze_mirc_color (text, &next.fg, &next.bg);
+			break;
+		case '\x0f':
+			next = blank;
+		}
+
+		while (start++ != text)
+		{
+			attrs[i++] = cur;
+			cur.starts_at_boundary = false;
+		}
+	}
+	return attrs;
+}
+
 static size_t
-wrap_text_for_single_line (const char *text, size_t text_len,
-	size_t line_len, struct str *output)
+wrap_text_for_single_line (const char *text, struct irc_char_attrs *attrs,
+	size_t text_len, size_t target_len, struct str *output)
 {
 	size_t eaten = 0;
 
@@ -8240,7 +8328,7 @@ wrap_text_for_single_line (const char *text, size_t text_len,
 	const char *word_start;
 	const char *word_end = text + strcspn (text, " ");
 	size_t word_len = word_end - text;
-	while (line_len && word_len <= line_len)
+	while (target_len && word_len <= target_len)
 	{
 		if (word_len)
 		{
@@ -8248,7 +8336,7 @@ wrap_text_for_single_line (const char *text, size_t text_len,
 
 			text += word_len;
 			eaten += word_len;
-			line_len -= word_len;
+			target_len -= word_len;
 		}
 
 		// Find the next word's end
@@ -8262,53 +8350,62 @@ wrap_text_for_single_line (const char *text, size_t text_len,
 		return eaten + (word_start - text);
 
 	// And if that doesn't help, cut the longest valid block of characters
-	for (const char *p = text; (size_t) (p - text) <= line_len; )
-	{
-		eaten = p - text;
-		hard_assert (utf8_decode (&p, text_len - eaten) >= 0);
-	}
+	for (size_t i = 1; i <= text_len && i <= target_len; i++)
+		if (i == text_len || attrs[i].starts_at_boundary)
+			eaten = i;
+
 	str_append_data (output, text, eaten);
 	return eaten;
 }
 
 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
+// In practice, this should never fail at all, although it's not guaranteed
 static bool
 wrap_message (const char *message,
 	int line_max, struct strv *output, struct error **e)
 {
+	size_t message_left = strlen (message), i = 0;
+	struct irc_char_attrs *attrs = irc_analyze_text (message, message_left);
+	struct str m = str_make ();
 	if (line_max <= 0)
 		goto error;
 
-	int message_left = strlen (message);
-	while (message_left > line_max)
+	while (m.len + message_left > (size_t) line_max)
 	{
-		struct str m = str_make ();
-
 		size_t eaten = wrap_text_for_single_line
-			(message, message_left, line_max, &m);
+			(message + i, attrs + i, message_left, line_max - m.len, &m);
 		if (!eaten)
-		{
-			str_free (&m);
 			goto error;
-		}
 
 		strv_append_owned (output, str_steal (&m));
-		message += eaten;
-		message_left -= eaten;
-	}
+		m = str_make ();
+
+		i += eaten;
+		if (!(message_left -= eaten))
+			break;
 
+		irc_serialize_char_attrs (attrs + i, &m);
+		if (m.len >= (size_t) line_max)
+		{
+			print_debug ("formatting continuation too long");
+			str_reset (&m);
+		}
+	}
 	if (message_left)
-		strv_append (output, message);
+		strv_append_owned (output,
+			xstrdup_printf ("%s%s", m.str, message + i));
 
+	free (attrs);
+	str_free (&m);
 	return true;
 
 error:
-	// Well, that's just weird
-	error_set (e,
+	free (attrs);
+	str_free (&m);
+	return error_set (e,
 		"Message splitting was unsuccessful as there was "
 		"too little room for UTF-8 characters");
-	return false;
 }
 
 /// Automatically splits messages that arrive at other clients with our prefix
@@ -14303,9 +14400,11 @@ test_aliases (void)
 static void
 test_wrapping (void)
 {
-	static const char *message = " foo bar foobar fóóbárbáz";
-	static const char *split[] =
-		{ " foo", "bar", "foob", "ar", "fó", "ób", "árb", "áz" };
+	static const char *message = " foo bar foobar fóóbárbáz\002 a\0031 b";
+	// XXX: formatting continuation order is implementation-dependent here
+	//   (irc_serialize_char_attrs() makes a choice in serialization)
+	static const char *split[] = { " foo", "bar", "foob", "ar",
+		"fó", "ób", "árb", "áz\x02", "\002a\0031", "\0031\002b" };
 
 	struct strv v = strv_make ();
 	hard_assert (wrap_message (message, 4, &v, NULL));
-- 
cgit v1.3.1