[tz] [PATCH 2/2] Shrink tzdata.zi somewhat

Paul Eggert eggert at cs.ucla.edu
Fri Jun 2 20:03:05 UTC 2017


Without this change, tzdata source's 669742 bytes (180832 bytes
compressed) shrank to tzdata.zi's 123627 bytes (22247 bytes compressed).
With this change, tzdata.zi is 106273 bytes (21203 bytes compressed).
That is, the change's data compression ratio is about 1.16 (1.05 for
compressed data), and the total data compression ratio of tzdata.zi is
now about 6.3 (8.5 for compressed data).  These figures assume
lzip -9 compression.
* Makefile (tzdata.zi): Do not set the Awk PACKRATDATA var,
as zishrink.awk now handles duplicates directly.
* Makefile (zonenames, $(TZS_NEW)):
* checklinks.awk:
Work even when line codes are abbreviated.
* zishrink.awk (paw_through_packratdata): Remove; no longer needed.
Caller removed.
(gen_rule_name, output_saved_lines): New functions.
(process_input_line): Use gen_rule_name to abbreviate rule names.
Abbreviate line codes and "max" too.  Save output lines
instead of printing them immediately, so that later output
lines can supersede earlier.
(END): Output saved lines.
---
 Makefile       |  10 +++---
 checklinks.awk |   4 +--
 zishrink.awk   | 105 +++++++++++++++++++++++++++++++++++++++++++--------------
 3 files changed, 85 insertions(+), 34 deletions(-)

diff --git a/Makefile b/Makefile
index 3aa9e04..c3ef931 100644
--- a/Makefile
+++ b/Makefile
@@ -478,9 +478,7 @@ version:	$(VERSION_DEPS)
 
 # This file can be tailored by setting BACKWARD, PACKRATDATA, etc.
 tzdata.zi:	$(TZDATA_ZI_DEPS)
-		LC_ALL=C $(AWK) -v PACKRATDATA='$(PACKRATDATA)' \
-		  -f zishrink.awk \
-		  $(TDATA) $(PACKRATDATA) >$@.out
+		LC_ALL=C $(AWK) -f zishrink.awk $(TDATA) $(PACKRATDATA) >$@.out
 		mv $@.out $@
 
 version.h:	version
@@ -558,11 +556,11 @@ zones:		$(REDO)
 $(TZS_NEW):	tzdata.zi zdump zic
 		mkdir -p tzs.dir
 		$(zic) -d tzs.dir tzdata.zi
-		$(AWK) '/^Link/{print $$1 "\t" $$2 "\t" $$3}' \
+		$(AWK) '/^L/{print "Link\t" $$2 "\t" $$3}' \
 		   tzdata.zi | LC_ALL=C sort >$@.out
 		wd=`pwd` && \
 		zones=`$(AWK) -v wd="$$wd" \
-				'/^Zone/{print wd "/tzs.dir/" $$2}' tzdata.zi \
+				'/^Z/{print wd "/tzs.dir/" $$2}' tzdata.zi \
 			 | LC_ALL=C sort` && \
 		./zdump -i -c $(TZS_YEAR) $$zones >>$@.out
 		sed 's,^TZ=".*tzs\.dir/,TZ=",' $@.out >$@.sed.out
@@ -826,7 +824,7 @@ typecheck:
 		done
 
 zonenames:	tzdata.zi
-		@$(AWK) '/^Zone/ { print $$2 } /^Link/ { print $$3 }' tzdata.zi
+		@$(AWK) '/^Z/ { print $$2 } /^L/ { print $$3 }' tzdata.zi
 
 asctime.o:	private.h tzfile.h
 date.o:		private.h
diff --git a/checklinks.awk b/checklinks.awk
index 5b3e157..f309010 100644
--- a/checklinks.awk
+++ b/checklinks.awk
@@ -9,7 +9,7 @@ BEGIN {
     Zone = "\n"
 }
 
-/^Zone/ {
+/^Z/ {
     if (defined[$2]) {
 	if (defined[$2] == Zone) {
 	    printf "%s: Zone has duplicate definition\n", $2
@@ -21,7 +21,7 @@ BEGIN {
     defined[$2] = Zone
 }
 
-/^Link/ {
+/^L/ {
     if (defined[$3]) {
 	if (defined[$3] == Zone) {
 	    printf "%s: Link with same name as Zone\n", $3
diff --git a/zishrink.awk b/zishrink.awk
index 235b8f3..2c05a8d 100644
--- a/zishrink.awk
+++ b/zishrink.awk
@@ -6,33 +6,52 @@
 # 'zic' should treat this script's output as if it were identical to
 # this script's input.
 
-function paw_through_packratdata(line)
+
+# Return a new rule name.
+# N_RULE_NAMES keeps track of how many rule names have been generated.
+
+function gen_rule_name(alphabet, base, rule_name, n, digit)
 {
-  if (PACKRATDATA) {
-    while (0 < (getline line <PACKRATDATA)) {
-      if (split(line, field)) {
-	if (field[1] == "Zone") packrat_zone[field[2]] = 1
-	if (field[1] == "Link") packrat_zone[field[3]] = 1
-      }
-    }
-    close(PACKRATDATA)
-  }
+  alphabet = ""
+  alphabet = alphabet "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+  alphabet = alphabet "abcdefghijklmnopqrstuvwxyz"
+  alphabet = alphabet "!$%&'()*+,./:;<=>?@[\\]^_`{|}~"
+  base = length(alphabet)
+  rule_name = ""
+  n = n_rule_names++
+
+  do {
+    n -= rule_name && n <= base
+    digit = n % base
+    rule_name = substr(alphabet, digit + 1, 1) rule_name
+    n = (n - digit) / base
+  } while (n);
+
+  return rule_name
 }
 
-function process_input_line(line, field, end)
+# Process an input line and save it for later output.
+
+function process_input_line(line, field, end, i, n, startdef)
 {
   # Remove comments, normalize spaces, and append a space to each line.
   sub(/#.*/, "", line)
   line = line " "
   gsub(/[[:space:]]+/, " ", line)
 
+  # Abbreviate keywords.  Do not abbreviate "Link" to just "L",
+  # as pre-2017c zic erroneously diagnoses "L" as ambiguous.
+  sub(/^Link /, "Li ", line)
+  sub(/^Rule /, "R ", line)
+  sub(/^Zone /, "Z ", line)
+
   # SystemV rules are not needed.
-  if (line ~ /^Rule SystemV /) next
+  if (line ~ /^R SystemV /) next
 
   # Replace FooAsia rules with the same rules without "Asia", as they
   # are duplicates.
   if (match(line, /[^ ]Asia /)) {
-    if (line ~ /^Rule /) next
+    if (line ~ /^R /) next
     line = substr(line, 1, RSTART) substr(line, RSTART + 5)
   }
 
@@ -53,7 +72,10 @@ function process_input_line(line, field, end)
     line = substr(line, 1, end - 3) substr(line, end - 1)
   }
 
-  # Abbreviate "only" and month names.
+  # Abbreviate "max", "only" and month names.
+  # Do not abbreviate "min", as pre-2017c zic erroneously diagnoses "mi"
+  # as ambiguous.
+  gsub(/ max /, " ma ", line)
   gsub(/ only /, " o ", line)
   gsub(/ Jan /, " Ja ", line)
   gsub(/ Feb /, " F ", line)
@@ -78,26 +100,57 @@ function process_input_line(line, field, end)
   # Remove unnecessary trailing " Ja" (for January).
   sub(/ Ja$/, "", line)
 
-  # Output lines unless they are later overridden in PACKRATDATA.
-  if (line ~ /^[LRZ]/) {
-    overridden = 0
-    if (FILENAME != PACKRATDATA) {
-      split(line, field)
-      if (field[1] == "Zone")
-	overridden = packrat_zone[field[2]]
-      else if (field[1] == "Link" && packrat_zone[field[3]])
-	next
+  n = split(line, field)
+
+  # Abbreviate rule names.
+  i = field[1] == "Z" ? 4 : field[1] == "Li" ? 0 : 2
+  if (i && field[i] ~ /^[^-+0-9]/) {
+    if (!rule[field[i]])
+      rule[field[i]] = gen_rule_name()
+    field[i] = rule[field[i]]
+  }
+
+  # If this zone supersedes an earlier one, delete the earlier one
+  # from the saved output lines.
+  startdef = ""
+  if (field[1] == "Z")
+    zonename = startdef = field[2]
+  else if (field[1] == "Li")
+    zonename = startdef = field[3]
+  else if (field[1] == "R")
+    zonename = ""
+  if (startdef) {
+    i = zonedef[startdef]
+    if (i) {
+      do
+	output_line[i - 1] = ""
+      while (output_line[i++] ~ /^[-+0-9]/);
     }
   }
-  if (!overridden)
-    print line
+  zonedef[zonename] = nout + 1
+
+  # Save the line for later output.
+  line = field[1]
+  for (i = 2; i <= n; i++)
+    line = line " " field[i]
+  output_line[nout++] = line
+}
+
+function output_saved_lines(i)
+{
+  for (i = 0; i < nout; i++)
+    if (output_line[i])
+      print output_line[i]
 }
 
 BEGIN {
   print "# This zic input file is in the public domain."
-  paw_through_packratdata()
 }
 
 /^[[:space:]]*[^#[:space:]]/ {
   process_input_line($0)
 }
+
+END {
+  output_saved_lines()
+}
-- 
2.9.4



More information about the tz mailing list