[tz] [PROPOSED] Generalize data format upgrade procedure

Paul Eggert eggert at cs.ucla.edu
Mon Feb 5 05:27:19 UTC 2018


* Makefile (XDST): Remove, replacing with the more-general ...
(DATAFORM): ... new macro.  All uses changed.
(fulldata.zi, pdstdata.zi): Remove, replacing with the more-general ...
(vanguard.zi, main.zi, rearguard.zi): ... new targets.
All uses changed.
(check_zishrink): Fix bug that caused this test to fail when
PACKRATDATA was nonempty and testing vanguard or rearguard data.
* NEWS, europe: Mention changes.
* ziguard.awk: Handle fractional seconds as well as negative DST offsets.
Rename from zidst.awk, since it now handles issues other than just DST.
All uses changed.
---
 Makefile    | 69 ++++++++++++++++++-----------------------------
 NEWS        | 64 ++++++++++++++++++++++++++------------------
 europe      | 29 +++++++++-----------
 zidst.awk   | 50 ----------------------------------
 ziguard.awk | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 166 insertions(+), 135 deletions(-)
 delete mode 100644 zidst.awk
 create mode 100644 ziguard.awk

diff --git a/Makefile b/Makefile
index 92ddb80..659a5b0 100644
--- a/Makefile
+++ b/Makefile
@@ -10,25 +10,14 @@ VERSION=	unknown
 # Email address for bug reports.
 BUGEMAIL=	tz@iana.org
 
-# To install the full data, which can contain daylight saving time
-# offsets that are negative (relative to standard time), use
-#	XDST=	full
-# To install data containing only positive daylight saving time
-# offsets, but otherwise as close to the full data as practical, use
-#	XDST=	pdst
-XDST=		pdst
-# Parsers requiring DST offsets to be positive should use the file
-# pdstdata.zi, which contains almost all the data of 'africa' etc.,
-# except with positive DST offsets.  This works around a problem that
-# was discovered in January 2018 with negative DST in tests for ICU
-# and OpenJDK.  See:
-# https://mm.icann.org/pipermail/tz/2018-January/025825.html
-# https://mm.icann.org/pipermail/tz/2018-January/025822.html
-# Currently the 'africa' etc. files use pdst form if comments are
-# ignored, to ease transition for parsers that do not support
-# negative DST offsets.  This is intended to change to full form at
-# some point, so that full-featured zi parsers that use the 'africa'
-# files will get the full data without changing anything.
+# Choose source data features.  To get new features right away, use:
+#	DATAFORM=	vanguard
+# To wait a while before using new features, to give downstream users
+# time to upgrade zic (the default), use:
+#	DATAFORM=	main
+# To wait even longer for new features, use:
+#	DATAFORM=	rearguard
+DATAFORM=		main
 
 # Change the line below for your time zone (after finding the zone you want in
 # the time zone files, or adding it to a time zone file).
@@ -483,8 +472,8 @@ TDATA=		$(YDATA) $(NDATA) $(BACKWARD)
 ZONETABLES=	zone1970.tab zone.tab
 TABDATA=	iso3166.tab $(TZDATA_TEXT) $(ZONETABLES)
 LEAP_DEPS=	leapseconds.awk leap-seconds.list
-TZDATA_ZI_DEPS=	zidst.awk zishrink.awk version $(TDATA) $(PACKRATDATA)
-DSTDATA_ZI_DEPS= zidst.awk $(TDATA) $(PACKRATDATA)
+TZDATA_ZI_DEPS=	ziguard.awk zishrink.awk version $(TDATA) $(PACKRATDATA)
+DSTDATA_ZI_DEPS= ziguard.awk $(TDATA) $(PACKRATDATA)
 DATA=		$(TDATA_TO_CHECK) backzone iso3166.tab leap-seconds.list \
 			leapseconds yearistype.sh $(ZONETABLES)
 AWK_SCRIPTS=	checklinks.awk checktab.awk leapseconds.awk zishrink.awk
@@ -522,7 +511,7 @@ VERSION_DEPS= \
 SHELL=		/bin/sh
 
 all:		tzselect yearistype zic zdump libtz.a $(TABDATA) \
-		  fulldata.zi pdstdata.zi
+		  vanguard.zi main.zi rearguard.zi
 
 ALL:		all date $(ENCHILADA)
 
@@ -558,14 +547,14 @@ version:	$(VERSION_DEPS)
 		mv $@.out $@
 
 # These files can be tailored by setting BACKWARD, PACKRATDATA, etc.
-fulldata.zi pdstdata.zi: $(DSTDATA_ZI_DEPS)
-		$(AWK) -v outfile='$@' -f zidst.awk $(TDATA) $(PACKRATDATA) \
+vanguard.zi main.zi rearguard.zi: $(DSTDATA_ZI_DEPS)
+		$(AWK) -v outfile='$@' -f ziguard.awk $(TDATA) $(PACKRATDATA) \
 		  >$@.out
 		mv $@.out $@
-tzdata.zi:	$(XDST)data.zi version
+tzdata.zi:	$(DATAFORM).zi version
 		version=`sed 1q version` && \
 		  LC_ALL=C $(AWK) -v version="$$version" -f zishrink.awk \
-		    $(XDST)data.zi >$@.out
+		    $(DATAFORM).zi >$@.out
 		mv $@.out $@
 
 version.h:	version
@@ -747,31 +736,25 @@ check_tzs:	$(TZS) $(TZS_NEW)
 check_web:	tz-how-to.html
 		$(VALIDATE_ENV) $(VALIDATE) $(VALIDATE_FLAGS) tz-how-to.html
 
-# The format of the source files, either full or pdst.
-# Currently they are in pdst format, but this is expected to change.
-SDST = pdst
-
-# Check that zishrink.awk does not alter the data, and that zidst.awk
-# preserves $(SDST) data.
+# Check that zishrink.awk does not alter the data, and that ziguard.awk
+# preserves main-format data.
 check_zishrink: zic leapseconds $(PACKRATDATA) $(TDATA) \
-  $(XDST)data.zi tzdata.zi
+		  $(DATAFORM).zi tzdata.zi
 		for type in posix right; do \
-		  mkdir -p time_t.dir/$$type time_t.dir/$$type-$(SDST) \
+		  mkdir -p time_t.dir/$$type time_t.dir/$$type-t \
 		    time_t.dir/$$type-shrunk && \
 		  case $$type in \
 		    right) leap='-L leapseconds';; \
 	            *) leap=;; \
 		  esac && \
-		  $(ZIC) $$leap -d time_t.dir/$$type $(XDST)data.zi && \
-		  $(AWK) '/^Rule/' $(XDST)data.zi | \
-		    $(ZIC) $$leap -d time_t.dir/$$type - $(PACKRATDATA) && \
-		  case $(XDST) in \
-		    $(SDST)) \
-		      $(ZIC) $$leap -d time_t.dir/$$type-$(SDST) $(TDATA) && \
+		  $(ZIC) $$leap -d time_t.dir/$$type $(DATAFORM).zi && \
+		  case $(DATAFORM) in \
+		    main) \
+		      $(ZIC) $$leap -d time_t.dir/$$type-t $(TDATA) && \
 		      $(AWK) '/^Rule/' $(TDATA) | \
-			$(ZIC) $$leap -d time_t.dir/$$type-$(SDST) \
-			  $(XDST)data.zi && \
-		      diff -r time_t.dir/$$type time_t.dir/$$type-$(SDST);; \
+			$(ZIC) $$leap -d time_t.dir/$$type-t - \
+			  $(PACKRATDATA) && \
+		      diff -r time_t.dir/$$type time_t.dir/$$type-t;; \
 		  esac && \
 		  $(ZIC) $$leap -d time_t.dir/$$type-shrunk tzdata.zi && \
 		  diff -r time_t.dir/$$type time_t.dir/$$type-shrunk || exit; \
diff --git a/NEWS b/NEWS
index adc9814..b13c356 100644
--- a/NEWS
+++ b/NEWS
@@ -3,8 +3,8 @@ News for the tz database
 Unreleased, experimental changes
 
   Briefly:
-  Support zi parsers that mishandle negative DST offsets
-  Add fractional seconds to source data format.
+  Add support for vanguard and rearguard data consumers.
+  Add fractional seconds to source data format and to vanguard data.
 
   Changes to past time stamps
 
@@ -16,30 +16,41 @@ Unreleased, experimental changes
 
   Changes to build procedure
 
-    The new XDST macro in the Makefile lets the installer choose
-    XDST=full, which allows arbitrary DST offsets in the data, or
-    XDST=pdst, which allows only positive DST offsets.  Choosing
-    XDST=full is arguably more correct for Ireland, which observes
-    Irish Standard Time (IST, UTC+01) in summer and GMT (UTC) in
-    winter.  Choosing XDST=pdst is better for zoneinfo parsers that do
-    not work well with negative DST offsets, notably OpenJDK+CLDR.
-    On platforms using tzcode or similar APIs, XDST should not affect
-    any behavior other than that depending on the tm_isdst flag.
-
-    For now this change does not affect client-visible behavior by
-    default, as the Makefile defaults to XDST=pdst and uncommented
-    parts of the data source files contain only pdst-format data.
-    After a bit of time for testing, XDST=full and full-format source
-    files are planned to become the default, so that parsers that
-    support negative DST offsets can get full data without changing
-    their build procedures.  Parsers requiring positive DST offsets
-    should use the new file pdstdata.zi instead of tzdata.zi or the
-    source files 'africa' etc.: pdstdata.zi is pdst-compatible, it is
-    automatically built from the data source files, and it will
-    continue to be pdst-compatible regardless of XDST.  To get
-    full-format data now, use the new file fulldata.zi, which will
-    continue to be full-format regardless of XDST.  To get the format
-    selected by XDST, use tzdata.zi.
+    The new DATAFORM macro in the Makefile lets the installer choose
+    among three source data formats.  The idea is to lessen downstream
+    disruption when data formats are improved.
+
+    * DATAFORM=vanguard installs from the latest, bleeding-edge
+      format.  DATAFORM=main (the default) installs from the format
+      used in the 'africa' etc. files.  DATAFORM=rearguard installs
+      from a trailing-edge format.  Eventually, elements of today's
+      vanguard format should move to the main format, and similarly
+      the main format's features should eventually move to the
+      rearguard format.
+
+    * In the current version, the main and rearguard formats are
+      identical and match that of 2018c, so this change does not
+      affect default behavior.  The vanguard format contains two
+      features not in the main format: fractional seconds and negative
+      DST offsets.  Fractional seconds were added in this release,
+      where they affect only zic input (output is unaffected).
+      Negative DST offsets improve support for Ireland, which uses
+      Irish Standard Time (IST, UTC+01) in summer and GMT (UTC) in
+      winter.  tzcode has supported negative DST offsets for decades,
+      and this feature should move to the main format soon.  However,
+      it will not move to the rearguard format for quite some time
+      because some downstream parsers do not support it.
+
+    * The build procedure constructs three files vanguard.zi, main.zi,
+      and rearguard.zi, one for each format.  The files represent the
+      same data as closely as the formats allow.  These three files
+      are intended for downstream data consumers and are not
+      installed.  Zoneinfo parsers that require positive DST offsets
+      should start using rearguard.zi, so that they will be unaffected
+      when the negative-DST feature moves from vanguard to main.
+      Bleeding-edge Zoneinfo parsers that support the new features
+      already can use vanguard.zi; in this respect, current tzcode is
+      bleeding-edge.
 
   Changes to code
 
@@ -48,6 +59,7 @@ Unreleased, experimental changes
     zic currently rounds these fractions to the nearest integer
     (breaking ties to the even integer), the fractions may be useful
     to applications requiring more precision in historical timestamps.
+    This extension is currently used only in vanguard.zi.
 
     The code is a bit more portable to MS-Windows.  (Thanks to Manuela
     Friedrich).
diff --git a/europe b/europe
index 76cbb5d..8aab26e 100644
--- a/europe
+++ b/europe
@@ -514,21 +514,19 @@ Link	Europe/London	Europe/Isle_of_Man
 # https://mm.icann.org/pipermail/tz/2018-January/025825.html
 # and with tests for OpenJDK:
 # https://mm.icann.org/pipermail/tz/2018-January/025822.html
-# To work around this problem, zidst.awk translates the following data
-# lines into two forms.  First, fulldata.zi contains the full data,
-# which includes negative DST offsets.  Second, pdstdata.zi uses a
-# traditional approximation for Irish time stamps after 1971-10-31
-# 02:00 UTC; although this approximation has tm_isdst flags that are
-# the reverse of the full data, its UTC offsets are correct and this
-# suffices for ICU and OpenJDK.  Although this source file currently
-# has pdstdata.zi lines active and fulldata.zi lines commented out,
-# this is intended to change in the near future and downstream code
-# should not rely on it.
+#
+# To work around this problem, the build procedure can translate the
+# following data into two forms, one with negative DST offsets and the
+# other form with a traditional approximation for Irish time stamps
+# after 1971-10-31 02:00 UTC; although this approximation has tm_isdst
+# flags that are reversed, its UTC offsets are correct and this often
+# suffices.  This source file currently uses only positive DST
+# offsets, but this is intended to change and downstream code should
+# not rely on it.
 #
 # The following is like GB-Eire and EU, except with standard time in
-# summer and negative daylight saving time in winter.
-# This rule set is active in fulldata.zi and is commented out in
-# pdstdata.zi.
+# summer and negative daylight saving time in winter.  It is for when
+# negative DST offsets are used.
 # Rule	NAME	FROM	TO	TYPE	IN	ON	AT	SAVE	LETTER/S
 #Rule	Eire	1971	only	-	Oct	31	 2:00u	-1:00	GMT
 #Rule	Eire	1972	1980	-	Mar	Sun>=16	 2:00u	0	IST
@@ -549,10 +547,9 @@ Zone	Europe/Dublin	-0:25:00 -	LMT	1880 Aug  2
 			 0:00	1:00	IST	1947 Nov  2  2:00s
 			 0:00	-	GMT	1948 Apr 18  2:00s
 			 0:00	GB-Eire	GMT/IST	1968 Oct 27
-# The next line is active in fulldata.zi and commented out in pdstdata.zi.
+# The next line is for when negative DST offsets are used.
 #			 1:00	Eire	IST/GMT
-# These three lines are active in pdstdata.zi and commented out in
-# fulldata.zi.
+# These three lines are for when positive DST offsets are used.
 			 1:00	-	IST	1971 Oct 31  2:00u
 			 0:00	GB-Eire	GMT/IST	1996
 			 0:00	EU	GMT/IST
diff --git a/zidst.awk b/zidst.awk
deleted file mode 100644
index 7885e9a..0000000
--- a/zidst.awk
+++ /dev/null
@@ -1,50 +0,0 @@
-# Convert tzdata source into full or positive-DST form
-
-# Contributed by Paul Eggert.  This file is in the public domain.
-
-# This is not a general-purpose converter; it is designed for current tzdata.
-#
-# When converting to full form, the output can use negative DST offsets.
-#
-# When converting to positive-DST form, the output uses only positive
-# DST offsets.  The idea is for the output data to simulate the
-# behavior of the input data as best it can within the constraints of
-# positive DST offsets.
-#
-# In the input, lines requiring the full format are commented #[full]
-# and the positive DST near-equivalents are commented #[pdst].
-
-BEGIN {
-  dst_type["full"] = 1
-  dst_type["pdst"] = 1
-
-  # The command line should set OUTFILE to the name of the output file,
-  # which should start with either "full" or "pdst".
-  todst = substr(outfile, 1, 4)
-  if (!dst_type[todst]) exit 1
-}
-
-/^Zone/ { zone = $2 }
-
-{
-  in_comment = /^#/
-
-  # Test whether this line should differ between the full and the pdst versions.
-  Rule_Eire = /^#?Rule[\t ]+Eire[\t ]/
-  Zone_Dublin_post_1968 \
-    = (zone == "Europe/Dublin" && /^#?[\t ]+[01]:00[\t ]/ \
-       && (!$(in_comment + 4) || 1968 < $(in_comment + 4)))
-
-  # If so, uncomment the desired version and comment out the undesired one.
-  if (Rule_Eire || Zone_Dublin_post_1968) {
-    if ((Rule_Eire \
-	 || (Zone_Dublin_post_1968 && $(in_comment + 3) == "IST/GMT"))	\
-	== (todst == "full")) {
-      sub(/^#/, "")
-    } else if (/^[^#]/) {
-      sub(/^/, "#")
-    }
-  }
-}
-
-{ print }
diff --git a/ziguard.awk b/ziguard.awk
new file mode 100644
index 0000000..795b4ef
--- /dev/null
+++ b/ziguard.awk
@@ -0,0 +1,89 @@
+# Convert tzdata source into vanguard or rearguard form.
+
+# Contributed by Paul Eggert.  This file is in the public domain.
+
+# This is not a general-purpose converter; it is designed for current tzdata.
+#
+# When converting to vanguard form, the output can use fractional seconds
+# and negative DST offsets.
+#
+# When converting to rearguard form, the output omits fractional
+# seconds and uses only positive DST offsets.  The idea is for the
+# output data to simulate the behavior of the input data as best it
+# can within the constraints of the rearguard format.
+
+BEGIN {
+  dst_type["vanguard.zi"] = 1
+  dst_type["main.zi"] = 1
+  dst_type["rearguard.zi"] = 1
+
+  # The command line should set OUTFILE to the name of the output file.
+  if (!dst_type[outfile]) exit 1
+  vanguard = outfile == "vanguard.zi"
+
+  # List non-integer standard times more accurately if known.
+  # This list does not attempt to record every UT offset that is
+  # not an integral multiple of 1 s; it merely records those that
+  # do not appear to be just LMT.
+  frac["-5:36:13"] = "-5:36:13.3" # America/Costa_Rica before 1921
+  frac["-5:07:10"] = "-5:07:10.41" # America/Jamaica before 1912
+  frac["-4:16:48"] = "-4:16:48.25" # America/Cordoba etc. 1894-1920
+  frac["-0:36:45"] = "-0:36:44.68" # Europe/Lisbon before 1912
+  frac["-0:25:21"] = "-0:25:21.1" # Europe/Dublin 1880-1916
+  frac["0:19:32"] = "0:19:32.13" # Europe/Amsterdam before 1937
+  frac["1:39:49"] = "1:39:49.2" # Europe/Helsinki before 1921
+  frac["2:05:09"] = "2:05:08.9" # Africa/Cairo before 1900
+  frac["4:37:11"] = "4:37:10.8" # Asia/Tashkent before 1924
+  frac["7:06:30"] = "7:06:30.1333" #... Asia/Ho_Chi_Minh 1906-1911
+  frac["7:07:12"] = "7:07:12.5" # Asia/Jakarta before 1923
+  frac["7:36:42"] = "7:36:41.7" # Asia/Hong_Kong before 1904
+  frac["8:05:43"] = "8:05:43.2" # Asia/Shanghai before 1901
+}
+
+/^Zone/ { zone = $2 }
+
+outfile != "main.zi" {
+  in_comment = /^#/
+
+  # If this line should differ due to Ireland using negative DST offsets,
+  # uncomment the desired version and comment out the undesired one.
+  Rule_Eire = /^#?Rule[\t ]+Eire[\t ]/
+  Zone_Dublin_post_1968 \
+    = (zone == "Europe/Dublin" && /^#?[\t ]+[01]:00[\t ]/ \
+       && (!$(in_comment + 4) || 1968 < $(in_comment + 4)))
+  if (Rule_Eire || Zone_Dublin_post_1968) {
+    if ((Rule_Eire \
+	 || (Zone_Dublin_post_1968 && $(in_comment + 3) == "IST/GMT"))	\
+	== vanguard) {
+      sub(/^#/, "")
+    } else if (/^[^#]/) {
+      sub(/^/, "#")
+    }
+  }
+
+  # Add or remove fractional seconds as needed.
+  f = $1 == "Zone" ? 3 : 1
+  for (rounded in frac) {
+    original = frac[rounded]
+    if ($f == rounded || $f == original) {
+      $f = vanguard ? original : rounded
+    }
+  }
+}
+
+# If a Link line is followed by a Zone line for the same data, comment
+# out the Link line.  This can happen if backzone overrides a Link
+# with a Zone.
+/^Link/ {
+  linkline[$3] = NR
+}
+/^Zone/ {
+  sub(/^Link/, "#Link", line[linkline[$2]])
+}
+
+{ line[NR] = $0 }
+
+END {
+  for (i = 1; i <= NR; i++)
+    print line[i]
+}
-- 
2.14.3



More information about the tz mailing list