21st century lam(1)

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

21st century lam(1)

Ingo Schwarze
Hi,

Some time ago, I went on a rampage to improve UTF-8 support in
userland utilities.  But there are still a few stragglers among the
more obscure beasts out there.  For example, have you met lam(1)?

When LC_CTYPE=en_US.UTF-8 is set, let's fix the column alignment
in the presence of width 0 and width 2 UTF-8 characters -- of course
without dying from invalid bytes that may also be present in the input.

Following the pattern we discovered in the past, this utility once
again needs its own multibyte char utility function, different from
the ones needed in ls(1), ps(1), and rs(1).  But as usual, this one
is quite simple, too.  All the same, I'd rather keep it in its own
file and not encumber the main code with it.

Of course, now that we measure widths with wcwidth(3), printf("%*.*s")
is no longer sufficient to do truncation and alignment, because its
field width and precision count bytes rather than display columns;
instead, we have to do truncation and alignment explicitly, taking
the various character display widths into account.  But doing that
is not too complicated.

OK?
  Ingo


Index: Makefile
===================================================================
RCS file: /cvs/src/usr.bin/lam/Makefile,v
retrieving revision 1.3
diff -u -p -r1.3 Makefile
--- Makefile 21 Sep 1997 11:49:24 -0000 1.3
+++ Makefile 11 Jul 2018 21:02:28 -0000
@@ -1,5 +1,6 @@
 # $OpenBSD: Makefile,v 1.3 1997/09/21 11:49:24 deraadt Exp $
 
 PROG= lam
+SRCS= lam.c utf8.c
 
 .include <bsd.prog.mk>
Index: lam.1
===================================================================
RCS file: /cvs/src/usr.bin/lam/lam.1,v
retrieving revision 1.9
diff -u -p -r1.9 lam.1
--- lam.1 4 Jan 2016 23:21:28 -0000 1.9
+++ lam.1 11 Jul 2018 21:02:28 -0000
@@ -74,8 +74,8 @@ is the minimum field width and
 the maximum field width.
 If
 .Ar min
-begins with a zero, zeros will be added to make up the field width,
-and if it begins with a
+begins with a zero, zeros will be prepended to make up the field width
+instead of blanks, and if it begins with a
 .Sq \&- ,
 the fragment will be left-adjusted
 within the field.
@@ -98,6 +98,22 @@ The newline normally appended to each ou
 .Pp
 To print files simultaneously for easy viewing use
 .Xr pr 1 .
+.Sh ENVIRONMENT
+.Bl -tag -width LC_CTYPE
+.It Ev LC_CTYPE
+The character encoding
+.Xr locale 1 .
+It determines the display widths of characters used by the
+.Fl f
+and
+.Fl p
+options.
+If unset or set to
+.Qq C ,
+.Qq POSIX ,
+or an unsupported value, each byte is regarded as a character
+of display width 1.
+.El
 .Sh EXAMPLES
 Join four files together along each line:
 .Pp
Index: lam.c
===================================================================
RCS file: /cvs/src/usr.bin/lam/lam.c,v
retrieving revision 1.21
diff -u -p -r1.21 lam.c
--- lam.c 11 Jul 2018 11:42:17 -0000 1.21
+++ lam.c 11 Jul 2018 21:02:28 -0000
@@ -39,6 +39,7 @@
 
 #include <ctype.h>
 #include <err.h>
+#include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -48,11 +49,13 @@
 
 struct openfile { /* open file structure */
  FILE *fp; /* file pointer */
+ int minwidth; /* pad this column to this width */
+ int maxwidth; /* truncate this column */
  short eof; /* eof flag */
  short pad; /* pad flag for missing columns */
  char eol; /* end of line character */
+ char align; /* '0' for zero fill, '-' for left align */
  char *sepstring; /* string to print before each line */
- char *format; /* printf(3) style string spec. */
 } input[NOFILE_MAX + 1]; /* last one is for the last -s arg. */
 #define INPUTSIZE sizeof(input) / sizeof(*input)
 
@@ -61,6 +64,8 @@ int nofinalnl; /* normally append \n to
 char line[BIGBUFSIZ];
 char *linep;
 
+int mbswidth_truncate(char *, int);  /* utf8.c */
+
 void usage(void);
 char *gatherline(struct openfile *);
 void getargs(int, char *[]);
@@ -71,6 +76,8 @@ main(int argc, char *argv[])
 {
  int i;
 
+ setlocale(LC_CTYPE, "");
+
  if (pledge("stdio rpath", NULL) == -1)
  err(1, "pledge");
 
@@ -106,9 +113,9 @@ void
 getargs(int argc, char *argv[])
 {
  struct openfile *ip = input;
- char *p;
+ const char *errstr;
+ char *p, *q;
  int ch, P, S, F, T;
- size_t siz;
 
  P = S = F = T = 0; /* capitalized options */
  while (optind < argc) {
@@ -120,17 +127,28 @@ getargs(int argc, char *argv[])
  case 'F': case 'f':
  F = (ch == 'F');
  /* Validate format string argument. */
- for (p = optarg; *p != '\0'; p++)
- if (!isdigit((unsigned char)*p) &&
-    *p != '.' && *p != '-')
- errx(1, "%s: invalid width specified",
-     optarg);
- /* '%' + width + 's' + '\0' */
- siz = p - optarg + 3;
- if ((p = realloc(ip->format, siz)) == NULL)
- err(1, NULL);
- snprintf(p, siz, "%%%ss", optarg);
- ip->format = p;
+ p = optarg;
+ if (*p == '0' || *p == '-')
+ ip->align = *p++;
+ else
+ ip->align = ' ';
+ if ((q = strchr(p, '.')) != NULL)
+ *q++ = '\0';
+ if (*p != '\0') {
+ ip->minwidth = strtonum(p, 1, INT_MAX,
+    &errstr);
+ if (errstr != NULL)
+ errx(1, "minimum width is %s: %s",
+    errstr, p);
+ }
+ if (q != NULL) {
+ ip->maxwidth = strtonum(q, 1, INT_MAX,
+    &errstr);
+ if (errstr != NULL)
+ errx(1, "maximum width is %s: %s",
+    errstr, q);
+ } else
+ ip->maxwidth = INT_MAX;
  break;
  case 'S': case 's':
  S = (ch == 'S');
@@ -157,10 +175,16 @@ getargs(int argc, char *argv[])
  ip->pad = P;
  if (ip->sepstring == NULL)
  ip->sepstring = S ? (ip-1)->sepstring : "";
- if (ip->format == NULL)
- ip->format = (P || F) ? (ip-1)->format : "%s";
  if (ip->eol == '\0')
  ip->eol = T ? (ip-1)->eol : '\n';
+ if (ip->align == '\0') {
+ if (F || P) {
+ ip->align = (ip-1)->align;
+ ip->minwidth = (ip-1)->minwidth;
+ ip->maxwidth = (ip-1)->maxwidth;
+ } else
+ ip->maxwidth = INT_MAX;
+ }
  ip++;
  optind++;
  break;
@@ -179,14 +203,14 @@ pad(struct openfile *ip)
 {
  size_t n;
  char *lp = linep;
+ int i = 0;
 
  n = strlcpy(lp, ip->sepstring,  line + sizeof(line) - lp);
  lp += (n < line + sizeof(line) - lp) ? n : strlen(lp);
- if (ip->pad) {
- n = snprintf(lp, line + sizeof(line) - lp, ip->format, "");
- if (n > 0)
- lp += (n < line + sizeof(line) - lp) ? n : strlen(lp);
- }
+ if (ip->pad)
+ while (i++ < ip->minwidth && lp + 1 < line + sizeof(line))
+ *lp++ = ' ';
+ *lp = '\0';
  return (lp);
 }
 
@@ -202,7 +226,7 @@ gatherline(struct openfile *ip)
  char *p;
  char *lp = linep;
  char *end = s + BUFSIZ - 1;
- int c;
+ int c, width;
 
  if (ip->eof)
  return (pad(ip));
@@ -220,9 +244,16 @@ gatherline(struct openfile *ip)
  numfiles++;
  n = strlcpy(lp, ip->sepstring, line + sizeof(line) - lp);
  lp += (n < line + sizeof(line) - lp) ? n : strlen(lp);
- n = snprintf(lp, line + sizeof(line) - lp, ip->format, s);
- if (n > 0)
- lp += (n < line + sizeof(line) - lp) ? n : strlen(lp);
+ width = mbswidth_truncate(s, ip->maxwidth);
+ if (ip->align != '-')
+ while (width++ < ip->minwidth && lp + 1 < line + sizeof(line))
+ *lp++ = ip->align;
+ n = strlcpy(lp, s, line + sizeof(line) - lp);
+ lp += (n < line + sizeof(line) - lp) ? n : strlen(lp);
+ if (ip->align == '-')
+ while (width++ < ip->minwidth && lp + 1 < line + sizeof(line))
+ *lp++ = ' ';
+ *lp = '\0';
  return (lp);
 }
 
Index: utf8.c
===================================================================
RCS file: utf8.c
diff -N utf8.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ utf8.c 11 Jul 2018 21:02:28 -0000
@@ -0,0 +1,47 @@
+/* $OpenBSD$ */
+/*
+ * Copyright (c) 2018 Ingo Schwarze <[hidden email]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <wchar.h>
+
+/*
+ * Measure the display width of the NUL-terminated multibyte string mbs.
+ * Invalid bytes and characters with negative wcwidth(3) count as width 1.
+ * If a character would exceed maxwidth, mbs is cut there in place (NUL).
+ * Return the total display width of the possibly shortened string.
+ */
+int
+mbswidth_truncate(char *mbs, int maxwidth)
+{
+ wchar_t wc;
+ int len, width, sum;
+
+ sum = 0; /* display columns accepted so far */
+ while (*mbs != '\0') {
+ if ((len = mbtowc(&wc, mbs, MB_CUR_MAX)) == -1)
+ len = width = 1; /* invalid byte: skip one byte, width 1; NOTE(review): mbtowc state is not reset here -- confirm */
+ else if ((width = wcwidth(wc)) < 0)
+ width = 1; /* wcwidth() reported a negative width */
+ if (sum + width > maxwidth) { /* this character would not fit */
+ *mbs = '\0'; /* truncate the caller's string in place */
+ break;
+ }
+ sum += width;
+ mbs += len; /* advance by the bytes consumed for this character */
+ }
+ return sum;
+}