[tz] [PROPOSED] Support zi parsers that mishandle negative DST offsets

Tue Jan 30 08:49:12 UTC 2018

This is intended to provide a way to support both clients that require
data to have only positive DST offsets, and clients that do not have
this restriction.
* Makefile (XDST, SDST): New macros.
(TZDATA_ZI_DEPS): Add zidst.awk.
(DSTDATA_ZI_DEPS): New macro.
(all): Depend on fulldata.zi and pdstdata.zi.
(fulldata.zi pdstdata.zi): New rule.
(tzdata.zi): Use $(XDST)data.zi instead of reading original source.
(check_zishrink): Check zidst.awk, too.
(clean): Remove all *.zi files, not just tzdata.zi.
* NEWS, europe: Mention this.
* zidst.awk: New file.
---
 Makefile  | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
 NEWS      | 30 ++++++++++++++++++++++++++++++
 europe    | 39 ++++++++++++++++++++++-----------------
 zidst.awk | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 154 insertions(+), 28 deletions(-)
 create mode 100644 zidst.awk

diff --git a/Makefile b/Makefile
index 8c84cd9..92ddb80 100644
--- a/Makefile
+++ b/Makefile
@@ -10,6 +10,26 @@ VERSION=	unknown
 # Email address for bug reports.
 BUGEMAIL=	tz at iana.org
 
+# To install the full data, which can contain daylight saving time
+# offsets that are negative (relative to standard time), use
+#	XDST=	full
+# To install data containing only positive daylight saving time
+# offsets, but otherwise as close to the full data as practical, use
+#	XDST=	pdst
+XDST=		pdst
+# Parsers requiring DST offsets to be positive should use the file
+# pdstdata.zi, which contains almost all the data of 'africa' etc.,
+# except with positive DST offsets.  This works around a problem that
+# was discovered in January 2018 with negative DST in tests for ICU
+# and OpenJDK.  See:
+# https://mm.icann.org/pipermail/tz/2018-January/025825.html
+# https://mm.icann.org/pipermail/tz/2018-January/025822.html
+# Currently the 'africa' etc. files use pdst form if comments are
+# ignored, to ease transition for parsers that do not support
+# negative DST offsets.  This is intended to change to full form at
+# some point, so that full-featured zi parsers that use the 'africa'
+# files will get the full data without changing anything.
+
 # Change the line below for your time zone (after finding the zone you want in
 # the time zone files, or adding it to a time zone file).
 # Alternately, if you discover you've got the wrong time zone, you can just
@@ -463,7 +483,8 @@ TDATA=		$(YDATA) $(NDATA) $(BACKWARD)
 ZONETABLES=	zone1970.tab zone.tab
 TABDATA=	iso3166.tab $(TZDATA_TEXT) $(ZONETABLES)
 LEAP_DEPS=	leapseconds.awk leap-seconds.list
-TZDATA_ZI_DEPS=	zishrink.awk version $(TDATA) $(PACKRATDATA)
+TZDATA_ZI_DEPS=	zidst.awk zishrink.awk version $(TDATA) $(PACKRATDATA)
+DSTDATA_ZI_DEPS= zidst.awk $(TDATA) $(PACKRATDATA)
 DATA=		$(TDATA_TO_CHECK) backzone iso3166.tab leap-seconds.list \
 			leapseconds yearistype.sh $(ZONETABLES)
 AWK_SCRIPTS=	checklinks.awk checktab.awk leapseconds.awk zishrink.awk
@@ -500,7 +521,8 @@ VERSION_DEPS= \
 
 SHELL=		/bin/sh
 
-all:		tzselect yearistype zic zdump libtz.a $(TABDATA)
+all:		tzselect yearistype zic zdump libtz.a $(TABDATA) \
+		  fulldata.zi pdstdata.zi
 
 ALL:		all date $(ENCHILADA)
 
@@ -535,11 +557,15 @@ version:	$(VERSION_DEPS)
 		printf '%s\n' "$$V" >$@.out
 		mv $@.out $@
 
-# This file can be tailored by setting BACKWARD, PACKRATDATA, etc.
-tzdata.zi:	$(TZDATA_ZI_DEPS)
+# These files can be tailored by setting BACKWARD, PACKRATDATA, etc.
+fulldata.zi pdstdata.zi: $(DSTDATA_ZI_DEPS)
+		$(AWK) -v outfile='$@' -f zidst.awk $(TDATA) $(PACKRATDATA) \
+		  >$@.out
+		mv $@.out $@
+tzdata.zi:	$(XDST)data.zi version zishrink.awk
 		version=`sed 1q version` && \
 		  LC_ALL=C $(AWK) -v version="$$version" -f zishrink.awk \
-		    $(TDATA) $(PACKRATDATA) >$@.out
+		    $(XDST)data.zi >$@.out
 		mv $@.out $@
 
 version.h:	version
@@ -721,17 +747,32 @@ check_tzs:	$(TZS) $(TZS_NEW)
 check_web:	tz-how-to.html
 		$(VALIDATE_ENV) $(VALIDATE) $(VALIDATE_FLAGS) tz-how-to.html
 
-# Check that tzdata.zi generates the same binary data that its sources do.
-check_zishrink: tzdata.zi zic leapseconds $(PACKRATDATA) $(TDATA)
+# The format of the source files, either full or pdst.
+# Currently they are in pdst format, but this is expected to change.
+SDST = pdst
+
+# Check that zishrink.awk does not alter the data, and that zidst.awk
+# preserves $(SDST) data.
+check_zishrink: zic leapseconds $(PACKRATDATA) $(TDATA) \
+  $(XDST)data.zi tzdata.zi
 		for type in posix right; do \
-		  mkdir -p time_t.dir/$$type time_t.dir/$$type-shrunk && \
+		  mkdir -p time_t.dir/$$type time_t.dir/$$type-$(SDST) \
+		    time_t.dir/$$type-shrunk && \
 		  case $$type in \
 		    right) leap='-L leapseconds';; \
 	            *) leap=;; \
 		  esac && \
-		  $(ZIC) $$leap -d time_t.dir/$$type $(TDATA) && \
-		  $(AWK) '/^Rule/' $(TDATA) | \
+		  $(ZIC) $$leap -d time_t.dir/$$type $(XDST)data.zi && \
+		  $(AWK) '/^Rule/' $(XDST)data.zi | \
 		    $(ZIC) $$leap -d time_t.dir/$$type - $(PACKRATDATA) && \
+		  case $(XDST) in \
+		    $(SDST)) \
+		      $(ZIC) $$leap -d time_t.dir/$$type-$(SDST) $(TDATA) && \
+		      $(AWK) '/^Rule/' $(TDATA) | \
+			$(ZIC) $$leap -d time_t.dir/$$type-$(SDST) \
+			  - $(PACKRATDATA) && \
+		      diff -r time_t.dir/$$type time_t.dir/$$type-$(SDST);; \
+		  esac && \
 		  $(ZIC) $$leap -d time_t.dir/$$type-shrunk tzdata.zi && \
 		  diff -r time_t.dir/$$type time_t.dir/$$type-shrunk || exit; \
 		done
@@ -741,7 +782,7 @@ clean_misc:
 		rm -f core *.o *.out \
 		  date tzselect version.h zdump zic yearistype libtz.a
 clean:		clean_misc
-		rm -fr *.dir tzdata.zi tzdb-*/ $(TZS_NEW)
+		rm -fr *.dir *.zi tzdb-*/ $(TZS_NEW)
 
 maintainer-clean: clean
 		@echo 'This command is intended for maintainers to use; it'
diff --git a/NEWS b/NEWS
index 4f763c0..c455f3c 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,36 @@ News for the tz database
 
 Unreleased, experimental changes
 
+  Briefly:
+  Support zi parsers that mishandle negative DST offsets
+
+  Changes to build procedure
+
+    The new XDST macro in the Makefile lets the installer choose
+    XDST=full, which allows arbitrary DST offsets in the data, or
+    XDST=pdst, which allows only positive DST offsets.  Choosing
+    XDST=full is arguably more correct for Ireland, which observes
+    Irish Standard Time (IST, UTC+01) in summer and GMT (UTC) in
+    winter.  Choosing XDST=pdst is better for zoneinfo parsers that do
+    not work well with negative DST offsets, notably OpenJDK+CLDR.
+    On platforms using tzcode or similar APIs, XDST should not affect
+    any behavior other than that depending on the tm_isdst flag.
+
+    For now this change does not affect client-visible behavior by
+    default, as the Makefile defaults to XDST=pdst and uncommented
+    parts of the data source files contain only pdst-format data.
+    After a bit of time for testing, XDST=full and full-format source
+    files are planned to become the default, so that parsers that
+    support negative DST offsets can get full data without changing
+    their build procedures.  Parsers requiring positive DST offsets
+    should use the new file pdstdata.zi instead of tzdata.zi or the
+    source files 'africa' etc.: pdstdata.zi is pdst-compatible, it is
+    automatically built from the data source files, and it will
+    continue to be pdst-compatible regardless of XDST.  To get
+    full-format data now, use the new file fulldata.zi, which will
+    continue to be full-format regardless of XDST.  To get the format
+    selected by XDST, use tzdata.zi.
+
   Changes to code
 
     The code is a bit more portable to MS-Windows.  (Thanks to Manuela
diff --git a/europe b/europe
index 6c1ccbe..5aeda33 100644
--- a/europe
+++ b/europe
@@ -508,11 +508,27 @@ Link	Europe/London	Europe/Jersey
 Link	Europe/London	Europe/Guernsey
 Link	Europe/London	Europe/Isle_of_Man
 
-# From Paul Eggert (2018-01-19):
+# From Paul Eggert (2018-01-30):
+# In January 2018 we discovered that the negative DST offsets in the
+# Eire rules cause problems with tests for ICU:
+# https://mm.icann.org/pipermail/tz/2018-January/025825.html
+# and with tests for OpenJDK:
+# https://mm.icann.org/pipermail/tz/2018-January/025822.html
+# To work around this problem, zidst.awk translates the following data
+# lines into two forms.  First, fulldata.zi contains the full data,
+# which includes negative DST offsets.  Second, pdstdata.zi uses a
+# traditional approximation for Irish time stamps after 1971-10-31
+# 02:00 UTC; although this approximation has tm_isdst flags that are
+# the reverse of the full data, its UTC offsets are correct and this
+# suffices for ICU and OpenJDK.  Although this source file currently
+# has pdstdata.zi lines active and fulldata.zi lines commented out,
+# this is intended to change in the near future and downstream code
+# should not rely on it.
+#
 # The following is like GB-Eire and EU, except with standard time in
 # summer and negative daylight saving time in winter.
-# Although currently commented out, this will need to become uncommented
-# once the ICU/OpenJDK workaround is removed; see below.
+# This rule set is active in fulldata.zi and is commented out in
+# pdstdata.zi.
 # Rule	NAME	FROM	TO	TYPE	IN	ON	AT	SAVE	LETTER/S
 #Rule	Eire	1971	only	-	Oct	31	 2:00u	-1:00	GMT
 #Rule	Eire	1972	1980	-	Mar	Sun>=16	 2:00u	0	IST
@@ -533,24 +549,13 @@ Zone	Europe/Dublin	-0:25:00 -	LMT	1880 Aug  2
 			 0:00	1:00	IST	1947 Nov  2  2:00s
 			 0:00	-	GMT	1948 Apr 18  2:00s
 			 0:00	GB-Eire	GMT/IST	1968 Oct 27
-# From Paul Eggert (2018-01-18):
-# The next line should look like this:
+# The next line is active in fulldata.zi and commented out in pdstdata.zi.
 #			 1:00	Eire	IST/GMT
-# However, in January 2018 we discovered that the Eire rules cause
-# problems with tests for ICU:
-# https://mm.icann.org/pipermail/tz/2018-January/025825.html
-# and with tests for OpenJDK:
-# https://mm.icann.org/pipermail/tz/2018-January/025822.html
-# To work around this problem, use a traditional approximation for
-# time stamps after 1971-10-31 02:00 UTC, to give ICU and OpenJDK
-# developers breathing room to fix bugs.  This approximation has
-# correct UTC offsets, but results in tm_isdst flags are the reverse
-# of what they should be.  This workaround is temporary and should be
-# removed reasonably soon.
+# These three lines are active in pdstdata.zi and commented out in
+# fulldata.zi.
 			 1:00	-	IST	1971 Oct 31  2:00u
 			 0:00	GB-Eire	GMT/IST	1996
 			 0:00	EU	GMT/IST
-# End of workaround for ICU and OpenJDK bugs.
 
 
 ###############################################################################
diff --git a/zidst.awk b/zidst.awk
new file mode 100644
index 0000000..7885e9a
--- /dev/null
+++ b/zidst.awk
@@ -0,0 +1,50 @@
+# Convert tzdata source into full or positive-DST form
+
+# Contributed by Paul Eggert.  This file is in the public domain.
+
+# This is not a general-purpose converter; it is designed for current tzdata.
+#
+# When converting to full form, the output can use negative DST offsets.
+#
+# When converting to positive-DST form, the output uses only positive
+# DST offsets.  The idea is for the output data to simulate the
+# behavior of the input data as best it can within the constraints of
+# positive DST offsets.
+#
+# In the input, the lines that differ between the two forms are found
+# by pattern; the unwanted form's lines are output commented out.
+
+BEGIN {
+  dst_type["full"] = 1
+  dst_type["pdst"] = 1
+
+  # The command line should set outfile (via -v) to the name of the
+  # output file, which should start with either "full" or "pdst".
+  todst = substr(outfile, 1, 4)
+  if (!dst_type[todst]) exit 1
+}
+
+/^Zone/ { zone = $2 }
+
+{
+  in_comment = /^#/
+
+  # Test whether this line should differ between the full and the pdst versions.
+  Rule_Eire = /^#?Rule[\t ]+Eire[\t ]/
+  Zone_Dublin_post_1968 \
+    = (zone == "Europe/Dublin" && /^#?[\t ]+[01]:00[\t ]/ \
+       && (!$(in_comment + 4) || 1968 < $(in_comment + 4)))
+
+  # If so, uncomment the desired version and comment out the undesired one.
+  if (Rule_Eire || Zone_Dublin_post_1968) {
+    if ((Rule_Eire \
+	 || (Zone_Dublin_post_1968 && $(in_comment + 3) == "IST/GMT"))	\
+	== (todst == "full")) {
+      sub(/^#/, "")
+    } else if (/^[^#]/) {
+      sub(/^/, "#")
+    }
+  }
+}
+
+{ print }
-- 
2.14.3