[tz] zic tweak to warn about non-ASCII in filenames

Paul Eggert eggert at cs.ucla.edu
Thu Jun 26 06:22:12 UTC 2014


Thanks.  I pushed the attached patch, which started with your patch and 
incorporated suggestions from the followup comments.  This patch also 
modifies 'Theory' to match practice better.
-------------- next part --------------
diff --git a/NEWS b/NEWS
index 1ee5813..5e5ba8e 100644
--- a/NEWS
+++ b/NEWS
@@ -15,10 +15,17 @@ Unreleased, experimental changes
     Error diagnostics of 'zic' and 'yearistype' have been reworded so that
     they no longer use ASCII '-' as if it were a dash.
 
+    'zic -v' now warns about output file names that do not follow POSIX rules,
+    or that contain a digit or a file name component of '.' or '..'.
+    (Thanks to Arthur David Olson for starting the ball rolling on this.)
+
     Some lint has been removed when using GCC_DEBUG_FLAGS with GCC 4.9.0.
 
   Changes affecting documentation and commentary
 
+    The 'Theory' file documents the longstanding exceptions to the
+    POSIX file name rules that are in 'etcetera' and 'backward'.
+
     Documentation and commentary now prefer UTF-8 to US-ASCII,
     allowing the use of proper accents in foreign words and names.
     Code and data have not changed because of this.
diff --git a/Theory b/Theory
index ce43b60..c31731a 100644
--- a/Theory
+++ b/Theory
@@ -405,7 +405,8 @@ in decreasing order of importance:
 		digits, as that might create an ambiguity with POSIX
 		TZ strings.  A file name component must not exceed 14
 		characters or start with '-'.  E.g., prefer 'Brunei'
-		to 'Bandar_Seri_Begawan'.
+		to 'Bandar_Seri_Begawan'.  Exceptions: see the discussion
+		of the 'etcetera' file below.
 	A name must not be empty, or contain '//', or start or end with '/'.
 	Do not use names that differ only in case.  Although the reference
 		implementation is case-sensitive, some other implementations
@@ -464,11 +465,21 @@ longitude, this relationship is not exact.
 Older versions of this package used a different naming scheme,
 and these older names are still supported.
 See the file 'backward' for most of these older names
-(e.g. 'US/Eastern' instead of 'America/New_York');
-excluding 'backward' should not affect the other data.
+(e.g., 'US/Eastern' instead of 'America/New_York').
 The other old-fashioned names still supported are
 'WET', 'CET', 'MET', and 'EET' (see the file 'europe').
 
+Older versions of this package defined names that were
+incompatible with POSIX.  These older names are still supported,
+even though they do not conform to the first rule of location names.
+These incompatible names are mostly defined in the file 'etcetera'.
+Also, the file 'backward' defines the incompatible names 'GMT0',
+'GMT-0', 'GMT+0', and 'Canada/East-Saskatchewan'.
+
+Excluding 'backward' should not affect the other data.  If
+'backward' is excluded, excluding 'etcetera' should not affect the
+remaining data.
+
 
 ----- Time zone abbreviations -----
 
diff --git a/zic.8 b/zic.8
index cfe0ad4..95dd038 100644
--- a/zic.8
+++ b/zic.8
@@ -112,6 +112,20 @@ before 1970 or after the start of 2038.
 .PP
 A time zone abbreviation has fewer than 3 characters.
 POSIX requires at least 3.
+.PP
+An output file name contains a byte that is not an ASCII letter, digit,
+.q "-" ,
+.q "." ,
+.q "/" ,
+or
+.q "_" ;
+or it contains a file name component that contains more than 14 bytes
+or that starts with
+.q "-"
+or is
+.q "."
+or
+.q ".." .
 .RE
 .TP
 .B \-s
diff --git a/zic.c b/zic.c
index 64d6781..62c5fd5 100644
--- a/zic.c
+++ b/zic.c
@@ -622,11 +622,58 @@ _("%s: More than one -L option specified\n"),
 }
 
 static void
+componentcheck(char const *name, char const *component,	/* warn about one file name component of NAME */
+	       char const *component_end)	/* the component is the bytes [component, component_end) */
+{
+	enum { component_len_max = 14 };	/* longest component that draws no overlength warning */
+	size_t component_len = component_end - component;	/* length in bytes */
+	if (0 < component_len && component[0] == '-')	/* nonempty component that starts with '-' */
+		warning(_("file name '%s' component contains leading '-'"),
+			name);
+	if (0 < component_len && component_len <= 2
+	    && component[0] == '.' && component_end[-1] == '.')	/* component is exactly "." or ".." */
+		warning(_("file name '%s' contains '%.*s' component"),
+			name, (int) component_len, component);
+	if (component_len_max < component_len)	/* more than component_len_max bytes */
+		warning(_("file name '%s' contains overlength component"
+			  " '%.*s...'"),
+			name, component_len_max, component);	/* precision prints only the first component_len_max bytes */
+}
+
+static void	/* warn about a questionable byte or component in NAME */
+namecheck(const char *name)
+{
+	register char const *cp;
+	static char const benign[] = ("-./_"	/* bytes that draw no warning; digits are not in this set */
+				      "abcdefghijklmnopqrstuvwxyz"
+				      "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+	register char const *component = name;	/* start of the component currently being scanned */
+	if (!noise)	/* no checks unless 'noise' is set (presumably by -v; confirm against the option parser) */
+		return;
+	for (cp = name; *cp; cp++) {
+		unsigned char c = *cp;
+		if (!strchr(benign, c)) {	/* first byte outside 'benign' */
+			warning((isascii(c) && isprint(c)
+				 ? _("file name '%s' contains byte '%c'")	/* printable ASCII byte is shown as is */
+				 : _("file name '%s' contains byte '\\%o'")),	/* any other byte is shown as a backslash plus its octal value */
+				name, c);
+			return;	/* one warning only; no component checks follow it */
+		}
+		if (c == '/') {	/* '/' ends the current component */
+			componentcheck(name, component, cp);
+			component = cp + 1;	/* the next component begins just after the '/' */
+		}
+	}
+	componentcheck(name, component, cp);	/* final component, which ends at the terminating NUL */
+}
+
+static void
 dolink(const char *const fromfield, const char *const tofield)
 {
 	register char *	fromname;
 	register char *	toname;
 
+	namecheck(tofield);
 	if (fromfield[0] == '/')
 		fromname = ecpyalloc(fromfield);
 	else {
@@ -1495,6 +1542,7 @@ writezone(const char *const name, const char *const string, char version)
 	void *typesptr = ats + timecnt;
 	unsigned char *types = typesptr;
 
+	namecheck(name);
 	/*
 	** Sort.
 	*/


More information about the tz mailing list