tsv_utils.common.utils source code

1 /**
2 Utilities used by tsv-utils applications. InputFieldReordering, BufferedOutputRange,
3 and several others.
4 
5 Utilities in this file:
6 $(LIST
7     * [InputFieldReordering] - A class that creates a reordered subset of fields from
8       an input line. Fields in the subset are accessed by array indices. This is
9       especially useful when processing the subset in a specific order, such as the
10       order listed on the command-line at run-time.
11 
12     * [BufferedOutputRange] - An OutputRange with an internal buffer used to buffer
13       output. Intended for use with stdout, it is a significant performance benefit.
14 
15     * [isFlushableOutputRange] - Tests if something is an OutputRange with a flush
16       member.
17 
18     * [bufferedByLine] - An input range that reads from a File handle line by line.
19       It is similar to the standard library method std.stdio.File.byLine, but quite a
20       bit faster. This is achieved by reading in larger blocks and buffering.
21 
22     * [InputSourceRange] - An input range that provides open file access to a set of
23       files. It is used to iterate over files passed as command line arguments. This
24       enables reading the header line of a file during command line argument processing,
25       then passing the open file to the main processing functions.
26 
27     * [ByLineSourceRange] - Similar to an InputSourceRange, except that it provides
28       access to a byLine iterator (bufferedByLine) rather than an open file. This is
29       used by tools that run the same processing logic on both header and non-header lines.
30 
31     * [joinAppend] - A function that performs a join, but appending the join output to
32       an output stream. It is a performance improvement over using join or joiner with
33       writeln.
34 
35     * [getTsvFieldValue] - A convenience function when only a single value is needed
36       from an input line.
37 
38     * [throwIfWindowsNewlineOnUnix] - A utility for Unix platform builds to detect
39       Windows newlines in input.
40 )
41 
42 Copyright (c) 2015-2020, eBay Inc.
43 Initially written by Jon Degenhardt
44 
45 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
46 */
47 
48 module tsv_utils.common.utils;
49 
50 import std.range;
51 import std.traits : isIntegral, isSomeChar, isSomeString, isUnsigned, ReturnType;
52 import std.typecons : Flag, No, Yes;
53 
54 // InputFieldReordering class.
55 
56 /** Flag used by the InputFieldReordering template. */
57 alias EnablePartialLines = Flag!"enablePartialLines";
58 
59 /**
60 InputFieldReordering - Move select fields from an input line to an output array,
61 reordering along the way.
62 
63 The InputFieldReordering class is used to reorder a subset of fields from an input line.
64 The caller instantiates an InputFieldReordering object at the start of input processing.
65 The instance contains a mapping from input index to output index, plus a buffer holding
66 the reordered fields. The caller processes each input line by calling initNewLine,
67 splitting the line into fields, and calling processNextField on each field. The output
68 buffer is ready when the allFieldsFilled method returns true.
69 
70 Fields are not copied, instead the output buffer points to the fields passed by the caller.
71 The caller needs to use or copy the output buffer while the fields are still valid, which
72 is normally until reading the next input line. The program below illustrates the basic use
73 case. It reads stdin and outputs fields [3, 0, 2], in that order. (See also joinAppend,
74 below, which has a performance improvement over join used here.)
75 
76 ---
77 int main(string[] args)
78 {
79     import tsv_utils.common.utils;
80     import std.algorithm, std.array, std.range, std.stdio;
81     size_t[] fieldIndicies = [3, 0, 2];
82     auto fieldReordering = new InputFieldReordering!char(fieldIndicies);
83     foreach (line; stdin.byLine)
84     {
85         fieldReordering.initNewLine;
86         foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate)
87         {
88             fieldReordering.processNextField(fieldIndex, fieldValue);
89             if (fieldReordering.allFieldsFilled) break;
90         }
91         if (fieldReordering.allFieldsFilled)
92         {
93             writeln(fieldReordering.outputFields.join('\t'));
94         }
95         else
96         {
97             writeln("Error: Insufficient number of field on the line.");
98         }
99     }
100     return 0;
101 }
102 ---
103 
104 Field indices are zero-based. An individual field can be listed multiple times. The
105 outputFields array is not valid until all the specified fields have been processed. The
106 allFieldsFilled method tests this. If a line does not have enough fields the outputFields
107 buffer cannot be used. For most TSV applications this is okay, as it means the line is
108 invalid and cannot be used. However, if partial lines are okay, the template can be
109 instantiated with EnablePartialLines.yes. This will ensure that any fields not filled-in
110 are empty strings in the outputFields return.
111 */
final class InputFieldReordering(C, EnablePartialLines partialLinesOk = EnablePartialLines.no)
if (isSomeChar!C)
{
    /* How it works: the constructor turns the requested input field indices into
     * 'fromToMap', an array of <from, to> pairs where 'from' is an input field index
     * and 'to' is a slot in the output buffer. The pairs are sorted, so a single pass
     * over the input fields in order can fill every slot. For example:
     *
     *    inputFieldIndicies: [3, 0, 7, 7, 1, 0, 9]
     *             fromToMap: [<0,1>, <0,5>, <1,4>, <3,0>, <7,2>, <7,3>, <9,6>]
     *
     * While a line is being processed, 'mapStack' is a slice of 'fromToMap' holding
     * the pairs that have not been filled yet.
     */
    import std.range;
    import std.typecons : Tuple;

    alias TupleFromTo = Tuple!(size_t, "from", size_t, "to");

    private C[][] outputFieldsBuf;
    private TupleFromTo[] fromToMap;
    private TupleFromTo[] mapStack;

    /** Builds the sorted from/to map and prepares the object for the first line.
     *
     * Params:
     *   inputFieldIndicies = Input field indices, listed in the desired output order.
     *   start = Value added to each output slot number (the enumeration start).
     */
    final this(const ref size_t[] inputFieldIndicies, size_t start = 0) pure nothrow @safe
    {
        import std.algorithm : sort;

        outputFieldsBuf = new C[][](inputFieldIndicies.length);
        fromToMap.reserve(inputFieldIndicies.length);

        foreach (offset, inputIndex; inputFieldIndicies)
        {
            fromToMap ~= TupleFromTo(inputIndex, start + offset);
        }

        sort(fromToMap);
        initNewLine();
    }

    /** initNewLine resets the object so a new line can be processed. */
    final void initNewLine() pure nothrow @safe
    {
        static if (partialLinesOk)
        {
            /* With partial lines enabled, unfilled slots must read as empty. */
            foreach (ref slot; outputFieldsBuf) slot.length = 0;
        }
        mapStack = fromToMap;
    }

    /** processNextField stores an input field in every output slot that requests it.
     *
     * Call it once per field on the line, in the order the fields occur. Processing
     * of the line may stop as soon as allFieldsFilled returns true.
     *
     * Returns: The number of output slots the field was written to. Zero means the
     * field is not part of the output.
     *
     * If some fields on the line are not passed to processNextField before
     * allFieldsFilled returns true, the caller should either make sure those fields
     * are not part of the output fields or have partial lines enabled.
     */
    final size_t processNextField(size_t fieldIndex, C[] fieldValue) pure nothrow @safe @nogc
    {
        size_t numFilled = 0;
        for (; mapStack.length > 0 && mapStack[0].from == fieldIndex; mapStack = mapStack[1 .. $])
        {
            outputFieldsBuf[mapStack[0].to] = fieldValue;
            ++numFilled;
        }
        return numFilled;
    }

    /** allFieldsFilled returns true once every expected field has been processed. */
    final bool allFieldsFilled() const pure nothrow @safe @nogc
    {
        return mapStack.length == 0;
    }

    /** outputFields returns the assembled output fields. Unless partial lines are
     * enabled, the result is only valid after allFieldsFilled is true.
     */
    final C[][] outputFields() pure nothrow @safe @nogc
    {
        return outputFieldsBuf;
    }
}
202 
203 // InputFieldReordering - Tests using different character types.
@safe unittest
{
    import std.conv : to;

    auto inputLines = [["r1f0", "r1f1", "r1f2",   "r1f3"],
                       ["r2f0", "abc",  "ÀBCßßZ", "ghi"],
                       ["r3f0", "123",  "456",    "789"]];

    size_t[] fields_2_0 = [2, 0];

    auto expected_2_0 = [["r1f2",   "r1f0"],
                         ["ÀBCßßZ", "r2f0"],
                         ["456",    "r3f0"]];

    /* Conversion only; none of the assertions below refer to it. */
    dstring[][] dstringExpected_2_0 = to!(dstring[][])(expected_2_0);

    /* Runs the [2, 0] reordering over every input line using character type Ch. */
    void checkCharType(Ch)()
    {
        Ch[][][] expected = to!(Ch[][][])(expected_2_0);
        auto ifr = new InputFieldReordering!Ch(fields_2_0);

        foreach (lineIndex, line; inputLines)
        {
            ifr.initNewLine;

            foreach (fieldIndex, fieldValue; line)
            {
                ifr.processNextField(fieldIndex, to!(Ch[])(fieldValue));

                /* Field 2 is the last one needed, so filling completes there. */
                immutable bool shouldBeFilled = fieldIndex >= 2;
                assert(shouldBeFilled == ifr.allFieldsFilled);
            }

            assert(ifr.allFieldsFilled);
            assert(ifr.outputFields == expected[lineIndex]);
        }
    }

    checkCharType!char();
    checkCharType!wchar();
    checkCharType!dchar();
}
252 
253 // InputFieldReordering - Test of partial line support.
@safe unittest
{
    import std.conv : to;

    auto inputLines = [["r1f0", "r1f1", "r1f2",   "r1f3"],
                       ["r2f0", "abc",  "ÀBCßßZ", "ghi"],
                       ["r3f0", "123",  "456",    "789"]];

    size_t[] fields_2_0 = [2, 0];

    /* Snapshot of the output fields expected after each field of each line. */
    auto snapshots =
        to!(char[][][][])(
            [
                [["", "r1f0"], ["", "r1f0"], ["r1f2", "r1f0"],   ["r1f2", "r1f0"]],
                [["", "r2f0"], ["", "r2f0"], ["ÀBCßßZ", "r2f0"], ["ÀBCßßZ", "r2f0"]],
                [["", "r3f0"], ["", "r3f0"], ["456", "r3f0"],    ["456", "r3f0"]],
            ]);

    auto reorderer = new InputFieldReordering!(char, EnablePartialLines.yes)(fields_2_0);

    foreach (lineIndex, line; inputLines)
    {
        auto lineSnapshots = snapshots[lineIndex];
        reorderer.initNewLine;

        foreach (fieldIndex, fieldValue; line)
        {
            char[] field = to!(char[])(fieldValue);
            reorderer.processNextField(fieldIndex, field);
            assert(reorderer.outputFields == lineSnapshots[fieldIndex]);
        }
    }
}
286 
287 // InputFieldReordering - Field combination tests.
@safe unittest
{
    import std.conv : to;

    auto inputLines = [["00", "01", "02", "03"],
                       ["10", "11", "12", "13"],
                       ["20", "21", "22", "23"]];

    /* One entry per field selection: the requested input field indices and the
     * output expected for each of the three input lines.
     */
    static struct Scenario
    {
        size_t[] fields;
        string[][] expected;
    }

    Scenario[] scenarios =
        [
            Scenario([0], [["00"],
                           ["10"],
                           ["20"]]),

            Scenario([3], [["03"],
                           ["13"],
                           ["23"]]),

            Scenario([0, 1], [["00", "01"],
                              ["10", "11"],
                              ["20", "21"]]),

            Scenario([1, 0], [["01", "00"],
                              ["11", "10"],
                              ["21", "20"]]),

            Scenario([0, 3], [["00", "03"],
                              ["10", "13"],
                              ["20", "23"]]),

            Scenario([3, 0], [["03", "00"],
                              ["13", "10"],
                              ["23", "20"]]),

            Scenario([0, 1, 2, 3], [["00", "01", "02", "03"],
                                    ["10", "11", "12", "13"],
                                    ["20", "21", "22", "23"]]),

            Scenario([3, 2, 1, 0], [["03", "02", "01", "00"],
                                    ["13", "12", "11", "10"],
                                    ["23", "22", "21", "20"]]),

            Scenario([0, 3, 0, 0, 1], [["00", "03", "00", "00", "01"],
                                       ["10", "13", "10", "10", "11"],
                                       ["20", "23", "20", "20", "21"]]),
        ];

    foreach (ref scenario; scenarios)
    {
        auto expected = to!(char[][][])(scenario.expected);
        auto ifr = new InputFieldReordering!char(scenario.fields);

        foreach (lineIndex, line; inputLines)
        {
            ifr.initNewLine;

            /* Every field is passed in; no early exit on allFieldsFilled. */
            foreach (fieldIndex, fieldValue; line)
            {
                ifr.processNextField(fieldIndex, to!(char[])(fieldValue));
            }

            assert(ifr.outputFields == expected[lineIndex]);
        }
    }
}
389 
390 
391 import std.stdio : File, isFileHandle, KeepTerminator;
392 import std.range : isOutputRange;
393 import std.traits : Unqual;
394 
395 /**
396 BufferedOutputRange is a performance enhancement over writing directly to an output
397 stream. It holds a File open for write or an OutputRange. Output is accumulated in an
398 internal buffer and written to the output stream as a block.
399 
400 Writing to stdout is a key use case. BufferedOutputRange is often dramatically faster
401 than writing to stdout directly. This is especially noticeable for outputs with short
402 lines, as it blocks many writes together in a single write.
403 
404 The internal buffer is written to the output stream after flushSize has been reached.
405 This is checked at newline boundaries, when appendln is called or when put is called
406 with a single newline character. Other writes check maxSize, which is used to avoid
407 runaway buffers.
408 
409 
410 BufferedOutputRange has a put method allowing it to be used as a range. It has a number
411 of other methods providing additional control.
412 
413 $(LIST
414     * `this(outputStream [, flushSize, reserveSize, maxSize])` - Constructor. Takes the
415       output stream, e.g. stdout. Other arguments are optional, defaults normally suffice.
416 
417     * `append(stuff)` - Append to the internal buffer.
418 
419     * `appendln(stuff)` - Append to the internal buffer, followed by a newline. The buffer
420       is flushed to the output stream if it has reached flushSize.
421 
422     * `appendln()` - Append a newline to the internal buffer. The buffer is flushed to the
423       output stream if it has reached flushSize.
424 
425     * `joinAppend(inputRange, delim)` - An optimization of `append(inputRange.joiner(delim))`.
426       For reasons that are not clear, joiner is quite slow.
427 
428     * `flushIfFull()` - Flush the internal buffer to the output stream if flushSize has been
429       reached.
430 
431     * `flush()` - Write the internal buffer to the output stream.
432 
433     * `put(stuff)` - Appends to the internal buffer. Acts as `appendln()` if passed a single
434       newline character, '\n' or "\n".
435 )
436 
437 The internal buffer is automatically flushed when the BufferedOutputRange goes out of
438 scope.
439 */
struct BufferedOutputRange(OutputTarget)
if (isFileHandle!(Unqual!OutputTarget) || isOutputRange!(Unqual!OutputTarget, char))
{
    import std.range : isOutputRange;
    import std.array : appender;
    import std.format : format;

    /* Identify the output element type. Only supporting char and ubyte for now. */
    static if (isFileHandle!OutputTarget || isOutputRange!(OutputTarget, char))
    {
        alias C = char;
    }
    else static if (isOutputRange!(OutputTarget, ubyte))
    {
        alias C = ubyte;
    }
    else static assert(false);

    /* Defaults used by the constructor when the caller does not supply sizes. */
    private enum defaultReserveSize = 11264;
    private enum defaultFlushSize = 10240;
    private enum defaultMaxSize = 4194304;

    private OutputTarget _outputTarget;
    private auto _outputBuffer = appender!(C[]);
    private immutable size_t _flushSize;
    private immutable size_t _maxSize;

    /* Constructor.
     *
     * Params:
     *   outputTarget = The File or output range that buffered data is written to.
     *   flushSize = Buffer length at which newline-boundary writes trigger a flush.
     *   reserveSize = Capacity reserved for the internal buffer up front.
     *   maxSize = Buffer length at which any checked write triggers a flush. Expected
     *             to be at least flushSize; if it is not (and asserts are disabled),
     *             flushSize is used as the max size instead.
     */
    this(OutputTarget outputTarget,
         size_t flushSize = defaultFlushSize,
         size_t reserveSize = defaultReserveSize,
         size_t maxSize = defaultMaxSize)
    {
        assert(flushSize <= maxSize);

        _outputTarget = outputTarget;
        _flushSize = flushSize;
        _maxSize = (flushSize <= maxSize) ? maxSize : flushSize;
        _outputBuffer.reserve(reserveSize);
    }

    /* Any data still in the buffer is written out when the range is destroyed. */
    ~this()
    {
        flush();
    }

    /* flush writes the buffer contents to the output target and clears the buffer.
     * File targets are written with write, other targets with put.
     */
    void flush()
    {
        static if (isFileHandle!OutputTarget) _outputTarget.write(_outputBuffer.data);
        else _outputTarget.put(_outputBuffer.data);

        _outputBuffer.clear;
    }

    /* flushIfFull flushes when the buffer length has reached flushSize.
     *
     * Returns: true if a flush was performed.
     */
    bool flushIfFull()
    {
        bool isFull = _outputBuffer.data.length >= _flushSize;
        if (isFull) flush();
        return isFull;
    }

    /* flushIfMaxSize is a safety check to avoid runaway buffer growth. */
    void flushIfMaxSize()
    {
        if (_outputBuffer.data.length >= _maxSize) flush();
    }

    /* maybeFlush is intended for the case where put is called with a trailing newline.
     *
     * Flushing occurs if the buffer has a trailing newline and has reached flush size.
     * Flushing also occurs if the buffer has reached max size.
     *
     * An empty buffer is never flushed. This matters when flushSize is zero: the
     * length test alone would pass for an empty buffer, and inspecting the last
     * element of an empty array is a range violation.
     *
     * Returns: true if a flush was performed.
     */
    private bool maybeFlush()
    {
        immutable size_t bufferedLength = _outputBuffer.data.length;

        immutable bool doFlush =
            bufferedLength > 0 &&
            bufferedLength >= _flushSize &&
            (_outputBuffer.data[$-1] == '\n' || bufferedLength >= _maxSize);

        if (doFlush) flush();
        return doFlush;
    }

    /* appendRaw adds to the internal buffer without any flush check. */
    private void appendRaw(T)(T stuff) pure
    {
        import std.range : rangePut = put;
        rangePut(_outputBuffer, stuff);
    }

    /* append adds to the internal buffer, then flushes if the buffer ends in a
     * newline and has reached flushSize, or if it has reached maxSize.
     */
    void append(T)(T stuff)
    {
        appendRaw(stuff);
        maybeFlush();
    }

    /* appendln() adds a newline to the internal buffer and flushes if flushSize has
     * been reached.
     *
     * Returns: true if a flush was performed.
     */
    bool appendln()
    {
        appendRaw('\n');
        return flushIfFull();
    }

    /* appendln(stuff) adds stuff followed by a newline to the internal buffer and
     * flushes if flushSize has been reached.
     *
     * Returns: true if a flush was performed.
     */
    bool appendln(T)(T stuff)
    {
        appendRaw(stuff);
        return appendln();
    }

    /* joinAppend is an optimization of append(inputRange.joiner(delimiter).
     * This form is quite a bit faster, 40%+ on some benchmarks.
     *
     * Only the maxSize check is made afterwards; flushSize is not checked here.
     */
    void joinAppend(InputRange, E)(InputRange inputRange, E delimiter)
    if (isInputRange!InputRange &&
        is(ElementType!InputRange : const C[]) &&
        (is(E : const C[]) || is(E : const C)))
    {
        if (!inputRange.empty)
        {
            appendRaw(inputRange.front);
            inputRange.popFront;
        }
        foreach (x; inputRange)
        {
            appendRaw(delimiter);
            appendRaw(x);
        }
        flushIfMaxSize();
    }

    /* Make this an output range.
     *
     * A single newline, as a character or as a string, is treated as appendln().
     * Other characters are added without a flush check. Everything else goes
     * through append.
     */
    void put(T)(T stuff)
    {
        import std.traits : isSomeChar, isSomeString;

        static if (isSomeChar!T)
        {
            if (stuff == '\n') appendln();
            else appendRaw(stuff);
        }
        else static if (isSomeString!T)
        {
            if (stuff == "\n") appendln();
            else append(stuff);
        }
        else append(stuff);
    }
}
586 
587 // BufferedOutputRange.
588 unittest
589 {
590     import tsv_utils.common.unittest_utils;
591     import std.file : rmdirRecurse, readText;
592     import std.path : buildPath;
593 
594     auto testDir = makeUnittestTempDir("tsv_utils_buffered_output");
595     scope(exit) testDir.rmdirRecurse;
596 
597     import std.algorithm : map, joiner;
598     import std.range : iota;
599     import std.conv : to;
600 
601     /* Basic test. Note that exiting the scope triggers flush. */
602     string filepath1 = buildPath(testDir, "file1.txt");
603     {
604         import std.stdio : File;
605 
606         auto ostream = BufferedOutputRange!File(filepath1.File("w"));
607         ostream.append("file1: ");
608         ostream.append("abc");
609         ostream.append(["def", "ghi", "jkl"]);
610         ostream.appendln(100.to!string);
611         ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
612         ostream.appendln();
613     }
614     assert(filepath1.readText == "file1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");
615 
616     /* Test with flushSize 0 and reserveSize 0, so every appendln triggers a flush. */
617     string filepath2 = buildPath(testDir, "file2.txt");
618     {
619         import std.stdio : File;
620 
621         auto ostream = BufferedOutputRange!File(filepath2.File("w"), 0, 0);
622         ostream.append("file2: ");
623         ostream.append("abc");
624         ostream.append(["def", "ghi", "jkl"]);
625         ostream.appendln("100");
626         ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
627         ostream.appendln();
628     }
629     assert(filepath2.readText == "file2: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");
630 
631     /* With a locking text writer. Requires version 2.078.0
632        See: https://issues.dlang.org/show_bug.cgi?id=9661
633      */
634     static if (__VERSION__ >= 2078)
635     {
636         string filepath3 = buildPath(testDir, "file3.txt");
637         {
638             import std.stdio : File;
639 
640             auto ltw = filepath3.File("w").lockingTextWriter;
641             {
642                 auto ostream = BufferedOutputRange!(typeof(ltw))(ltw);
643                 ostream.append("file3: ");
644                 ostream.append("abc");
645                 ostream.append(["def", "ghi", "jkl"]);
646                 ostream.appendln("100");
647                 ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
648                 ostream.appendln();
649             }
650         }
651         assert(filepath3.readText == "file3: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");
652     }
653 
654     /* With an Appender. */
655     import std.array : appender;
656     auto app1 = appender!(char[]);
657     {
658         auto ostream = BufferedOutputRange!(typeof(app1))(app1);
659         ostream.append("appender1: ");
660         ostream.append("abc");
661         ostream.append(["def", "ghi", "jkl"]);
662         ostream.appendln("100");
663         ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
664         ostream.appendln();
665     }
666     assert(app1.data == "appender1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");
667 
668     /* With an Appender, but checking flush boundaries. */
669     auto app2 = appender!(char[]);
670     {
671         auto ostream = BufferedOutputRange!(typeof(app2))(app2, 10, 0); // Flush if 10+
672         bool wasFlushed = false;
673 
674         assert(app2.data == "");
675 
676         ostream.append("12345678"); // Not flushed yet.
677         assert(app2.data == "");
678 
679         wasFlushed = ostream.appendln;  // Ninth char, not flushed yet.
680         assert(!wasFlushed);
681         assert(app2.data == "");
682 
683         wasFlushed = ostream.appendln;  // Tenth char, now flushed.
684         assert(wasFlushed);
685         assert(app2.data == "12345678\n\n");
686 
687         app2.clear;
688         assert(app2.data == "");
689 
690         ostream.append("12345678");
691 
692         wasFlushed = ostream.flushIfFull;
693         assert(!wasFlushed);
694         assert(app2.data == "");
695 
696         ostream.flush;
697         assert(app2.data == "12345678");
698 
699         app2.clear;
700         assert(app2.data == "");
701 
702         ostream.append("123456789012345");
703         assert(app2.data == "");
704     }
705     assert(app2.data == "123456789012345");
706 
707     /* Using joinAppend. */
708     auto app1b = appender!(char[]);
709     {
710         auto ostream = BufferedOutputRange!(typeof(app1b))(app1b);
711         ostream.append("appenderB: ");
712         ostream.joinAppend(["a", "bc", "def"], '-');
713         ostream.append(':');
714         ostream.joinAppend(["g", "hi", "jkl"], '-');
715         ostream.appendln("*100*");
716         ostream.joinAppend(iota(0, 6).map!(x => x.to!string), ' ');
717         ostream.append(' ');
718         ostream.joinAppend(iota(6, 10).map!(x => x.to!string), " ");
719         ostream.appendln();
720     }
721     assert(app1b.data == "appenderB: a-bc-def:g-hi-jkl*100*\n0 1 2 3 4 5 6 7 8 9\n",
722            "app1b.data: |" ~app1b.data ~ "|");
723 
724     /* Operating as an output range. When passed to a function as a ref, exiting
725      * the function does not flush. When passed as a value, it gets flushed when
726      * the function returns. Also test both UFCS and non-UFCS styles.
727      */
728 
729     void outputStuffAsRef(T)(ref T range)
730     if (isOutputRange!(T, char))
731     {
732         range.put('1');
733         put(range, "23");
734         range.put('\n');
735         range.put(["5", "67"]);
736         put(range, iota(8, 10).map!(x => x.to!string));
737         put(range, "\n");
738     }
739 
740     void outputStuffAsVal(T)(T range)
741     if (isOutputRange!(T, char))
742     {
743         put(range, '1');
744         range.put("23");
745         put(range, '\n');
746         put(range, ["5", "67"]);
747         range.put(iota(8, 10).map!(x => x.to!string));
748         range.put("\n");
749     }
750 
751     auto app3 = appender!(char[]);
752     {
753         auto ostream = BufferedOutputRange!(typeof(app3))(app3, 12, 0); // flushSize 12, reserveSize 0
754         outputStuffAsRef(ostream);
755         assert(app3.data == "", "app3.data: |" ~app3.data ~ "|"); // 10 chars buffered, below flushSize
756         outputStuffAsRef(ostream);
757         assert(app3.data == "123\n56789\n123\n", "app3.data: |" ~app3.data ~ "|");
758     }
759     assert(app3.data == "123\n56789\n123\n56789\n", "app3.data: |" ~app3.data ~ "|");
760 
761     auto app4 = appender!(char[]);
762     {
763         auto ostream = BufferedOutputRange!(typeof(app4))(app4, 12, 0); // flushSize 12, reserveSize 0
764         outputStuffAsVal(ostream);
765         assert(app4.data == "123\n56789\n", "app4.data: |" ~app4.data ~ "|");
766         outputStuffAsVal(ostream);
767         assert(app4.data == "123\n56789\n123\n56789\n", "app4.data: |" ~app4.data ~ "|");
768     }
769     assert(app4.data == "123\n56789\n123\n56789\n", "app4.data: |" ~app4.data ~ "|");
770 
771     /* Test maxSize. */
772     auto app5 = appender!(char[]);
773     {
774         auto ostream = BufferedOutputRange!(typeof(app5))(app5, 5, 0, 10); // maxSize 10
775         assert(app5.data == "");
776 
777         ostream.append("1234567");  // Not flushed yet (no newline).
778         assert(app5.data == "");
779 
780         ostream.append("89012");    // Flushed by maxSize
781         assert(app5.data == "123456789012");
782 
783         ostream.put("1234567");     // Not flushed yet (no newline).
784         assert(app5.data == "123456789012");
785 
786         ostream.put("89012");       // Flushed by maxSize
787         assert(app5.data == "123456789012123456789012");
788 
789         ostream.joinAppend(["ab", "cd"], '-');        // Not flushed yet
790         ostream.joinAppend(["de", "gh", "ij"], '-');  // Flushed by maxSize
791         assert(app5.data == "123456789012123456789012ab-cdde-gh-ij");
792     }
793     assert(app5.data == "123456789012123456789012ab-cdde-gh-ij");
794 }
795 
796 /**
797 isFlushableOutputRange returns true if R is an output range with a flush member.
798 */
template isFlushableOutputRange(R, E = char)
{
    // True when R accepts elements of type E as an output range and the expression
    // r.flush, for an R named r, compiles with a void result.
    enum bool isFlushableOutputRange =
        isOutputRange!(R, E) && is(ReturnType!((R r) => r.flush) == void);
}
801 
@safe unittest
{
    import std.array;
    auto app = appender!(char[]);
    auto ostream = BufferedOutputRange!(typeof(app))(app, 5, 0, 10); // maxSize 10

    alias PlainSink = typeof(app);
    alias BufferedSink = typeof(ostream);
    alias BufferedCharAppender = BufferedOutputRange!(Appender!(char[]));

    /* A plain Appender is an output range, but has no flush member. */
    static assert(isOutputRange!(PlainSink, char));
    static assert(!isFlushableOutputRange!(PlainSink, char));
    static assert(!isFlushableOutputRange!(PlainSink));

    /* Wrapping it in a BufferedOutputRange makes it flushable. */
    static assert(isOutputRange!(BufferedSink, char));
    static assert(isFlushableOutputRange!(BufferedSink, char));
    static assert(isFlushableOutputRange!(BufferedSink));

    /* The same checks, with the types spelled out rather than taken from values. */
    static assert(isOutputRange!(Appender!string, string));
    static assert(!isFlushableOutputRange!(Appender!string, string));
    static assert(!isFlushableOutputRange!(Appender!string));

    static assert(isOutputRange!(Appender!(char[]), char));
    static assert(!isFlushableOutputRange!(Appender!(char[]), char));
    static assert(!isFlushableOutputRange!(Appender!(char[])));

    static assert(isOutputRange!(BufferedCharAppender, char));
    static assert(isFlushableOutputRange!(BufferedCharAppender));
    static assert(isFlushableOutputRange!(BufferedCharAppender, char));
}
828 
829 
/**
bufferedByLine is a performance enhancement over std.stdio.File.byLine. It works by
reading a large buffer from the input stream rather than just a single line.

The file argument needs to be a File object open for reading, typically a filesystem
file or standard input. Use the Yes.keepTerminator template parameter to keep the
newline. This is similar to stdio.File.byLine, except that it is specified as a
template parameter rather than a runtime parameter.

Template parameters:
$(LIST
    * `keepTerminator` - Whether `front` includes the line terminator.
    * `Char` - Element type of the returned lines, either `char` or `ubyte`.
    * `terminator` - The byte that ends a line.
    * `readSize` - Number of bytes requested from the file on each read.
    * `growSize` - Increment used when the internal buffer must be enlarged. Must be
      greater than zero and no larger than `readSize`.
)

The slice returned by `front` refers to the internal buffer, which is reused and may
be reallocated by `popFront`. Copy the line if it must outlive the next `popFront`.

Reading in blocks does mean that input is not read until a full buffer is available or
end-of-file is reached. For this reason, bufferedByLine is not appropriate for
interactive input.
*/

auto bufferedByLine(KeepTerminator keepTerminator = No.keepTerminator, Char = char,
                    ubyte terminator = '\n', size_t readSize = 1024 * 128, size_t growSize = 1024 * 16)
    (File file)
if (is(Char == char) || is(Char == ubyte))
{
    static assert(0 < growSize && growSize <= readSize);

    static final class BufferedByLineImpl
    {
        /* Buffer state:
         *   - _buffer.length - Total size of the allocated buffer.
         *   - _dataEnd - One past the last valid byte (end of the most recent read).
         *   - _lineStart - Index of the first byte of the current line.
         *   - _lineEnd - One past the last byte of the current line, terminator included.
         */
        private File _file;
        private ubyte[] _buffer;
        private size_t _lineStart = 0;
        private size_t _lineEnd = 0;
        private size_t _dataEnd = 0;

        this (File f)
        {
            _file = f;
            _buffer = new ubyte[readSize + growSize];
        }

        bool empty() const pure
        {
            return _file.eof && _lineStart == _dataEnd;
        }

        Char[] front() pure
        {
            assert(!empty, "Attempt to take the front of an empty bufferedByLine.");

            static if (keepTerminator == Yes.keepTerminator)
            {
                return cast(Char[]) _buffer[_lineStart .. _lineEnd];
            }
            else
            {
                assert(_lineStart < _lineEnd);

                /* Drop the terminator if the line has one. The last line of a file
                 * may not.
                 */
                size_t contentEnd = _lineEnd;
                if (_buffer[contentEnd - 1] == terminator) --contentEnd;
                return cast(Char[]) _buffer[_lineStart .. contentEnd];
            }
        }

        /* Searches _buffer[searchFrom .. _dataEnd] for the terminator and updates
         * _lineEnd. When a terminator is found, _lineEnd is set to one past it and
         * true is returned. Otherwise _lineEnd is set to _dataEnd and false is
         * returned.
         */
        private bool locateLineEnd(size_t searchFrom)
        {
            import std.algorithm : find;

            /* 'find' returns the slice starting at the terminator, or an empty slice
             * if there is no terminator.
             */
            immutable tailLength = _buffer[searchFrom .. _dataEnd].find(terminator).length;

            if (tailLength == 0)
            {
                _lineEnd = _dataEnd;
                return false;
            }

            _lineEnd = _dataEnd - tailLength + 1;
            return true;
        }

        /* Enlarges the buffer, in multiples of growSize, until at least readSize
         * bytes are free after _dataEnd.
         */
        private void ensureReadCapacity()
        {
            immutable freeSpace = _buffer.length - _dataEnd;
            if (freeSpace >= readSize) return;

            immutable shortfall = readSize - freeSpace;
            immutable increments = (shortfall + growSize - 1) / growSize;
            _buffer.length += increments * growSize;
        }

        /* Note: Call popFront at initialization to do the initial read. */
        void popFront()
        {
            import std.algorithm : copy;
            assert(!empty, "Attempt to popFront an empty bufferedByLine.");

            /* Discard the current line, then look for the end of the next line in the
             * data already buffered.
             */
            _lineStart = _lineEnd;
            if (locateLineEnd(_lineStart) || _file.eof) return;

            /* No terminator in the buffered data and the file has more to give. Read
             * until a terminator shows up or the file is exhausted.
             */
            assert(_lineEnd == _dataEnd);

            if (_lineStart > 0)
            {
                /* Slide the partial line to the front of the buffer. */
                immutable partialLength = _dataEnd - _lineStart;
                copy(_buffer[_lineStart .. _dataEnd], _buffer[0 .. partialLength]);
                _lineStart = 0;
                _lineEnd = _dataEnd = partialLength;
            }

            for (;;)
            {
                ensureReadCapacity();

                _dataEnd += _file.rawRead(_buffer[_dataEnd .. _dataEnd + readSize]).length;

                /* _lineEnd still marks the end of the previously searched data, so
                 * only the newly read bytes are scanned.
                 */
                if (locateLineEnd(_lineEnd) || _file.eof) break;
            }
        }
    }

    assert(file.isOpen, "bufferedByLine passed a closed file.");

    auto lineRange = new BufferedByLineImpl(file);
    if (!lineRange.empty) lineRange.popFront;
    return lineRange;
}
959 
// BufferedByLine.
unittest
{
    import std.array : appender;
    import std.conv : to;
    import std.file : rmdirRecurse, readText;
    import std.path : buildPath;
    import std.range : lockstep;
    import std.stdio;
    import tsv_utils.common.unittest_utils;

    auto testDir = makeUnittestTempDir("tsv_utils_buffered_byline");
    scope(exit) testDir.rmdirRecurse;

    /* Create two data files with the same data. Read both in parallel with byLine and
     * bufferedByLine and compare each line.
     */
    auto data1 = appender!(char[])();

    foreach (i; 1 .. 1001) data1.put('\n');
    foreach (i; 1 .. 1001) data1.put("a\n");
    foreach (i; 1 .. 1001) { data1.put(i.to!string); data1.put('\n'); }
    foreach (i; 1 .. 1001)
    {
        foreach (j; 1 .. i+1) data1.put('x');
        data1.put('\n');
    }

    string file1a = buildPath(testDir, "file1a.txt");
    string file1b = buildPath(testDir, "file1b.txt");
    {

        file1a.File("w").write(data1.data);
        file1b.File("w").write(data1.data);
    }

    /* Default parameters. */
    {
        auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator);
        auto f1bIn = file1b.File().byLine(No.keepTerminator);
        foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
    }
    {
        auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator);
        auto f1bIn = file1b.File().byLine(Yes.keepTerminator);
        foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
    }

    /* Smaller read size. This will trigger buffer growth. */
    {
        auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', 512, 256);
        auto f1bIn = file1b.File().byLine(No.keepTerminator);
        foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
    }

    /* Exercise boundary cases in buffer growth.
     * Note: static-foreach requires DMD 2.076 / LDC 1.6
     */
    static foreach (readSize; [1, 2, 4])
    {
        static foreach (growSize; 1 .. readSize + 1)
        {{
            auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize);
            auto f1bIn = file1b.File().byLine(No.keepTerminator);
            foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
        }}
        static foreach (growSize; 1 .. readSize + 1)
        {{
            auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize);
            auto f1bIn = file1b.File().byLine(Yes.keepTerminator);
            foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
        }}
    }


    /* Files that do not end in a newline. The file1 pair is overwritten here. The
     * file lengths (1, 2, 3, 4 bytes) straddle the read sizes used below; the 4-byte
     * file exactly fills a single read when readSize is 4.
     */

    string file2a = buildPath(testDir, "file2a.txt");
    string file2b = buildPath(testDir, "file2b.txt");
    string file3a = buildPath(testDir, "file3a.txt");
    string file3b = buildPath(testDir, "file3b.txt");
    string file4a = buildPath(testDir, "file4a.txt");
    string file4b = buildPath(testDir, "file4b.txt");
    {
        file1a.File("w").write("a");
        file1b.File("w").write("a");
        file2a.File("w").write("ab");
        file2b.File("w").write("ab");
        file3a.File("w").write("abc");
        file3b.File("w").write("abc");
        file4a.File("w").write("abcd");
        file4b.File("w").write("abcd");
    }

    static foreach (readSize; [1, 2, 4])
    {
        static foreach (growSize; 1 .. readSize + 1)
        {{
            auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize);
            auto f1bIn = file1b.File().byLine(No.keepTerminator);
            foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);

            auto f2aIn = file2a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize);
            auto f2bIn = file2b.File().byLine(No.keepTerminator);
            foreach (a, b; lockstep(f2aIn, f2bIn, StoppingPolicy.requireSameLength)) assert(a == b);

            auto f3aIn = file3a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize);
            auto f3bIn = file3b.File().byLine(No.keepTerminator);
            foreach (a, b; lockstep(f3aIn, f3bIn, StoppingPolicy.requireSameLength)) assert(a == b);

            auto f4aIn = file4a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize);
            auto f4bIn = file4b.File().byLine(No.keepTerminator);
            foreach (a, b; lockstep(f4aIn, f4bIn, StoppingPolicy.requireSameLength)) assert(a == b);
        }}
        static foreach (growSize; 1 .. readSize + 1)
        {{
            auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize);
            auto f1bIn = file1b.File().byLine(Yes.keepTerminator);
            foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);

            auto f2aIn = file2a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize);
            auto f2bIn = file2b.File().byLine(Yes.keepTerminator);
            foreach (a, b; lockstep(f2aIn, f2bIn, StoppingPolicy.requireSameLength)) assert(a == b);

            auto f3aIn = file3a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize);
            auto f3bIn = file3b.File().byLine(Yes.keepTerminator);
            foreach (a, b; lockstep(f3aIn, f3bIn, StoppingPolicy.requireSameLength)) assert(a == b);

            auto f4aIn = file4a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize);
            auto f4bIn = file4b.File().byLine(Yes.keepTerminator);
            foreach (a, b; lockstep(f4aIn, f4bIn, StoppingPolicy.requireSameLength)) assert(a == b);
        }}
    }
}
1084 
/**
joinAppend performs a join operation on an input range, appending the results to
an output range.

joinAppend was written as a performance enhancement over using std.algorithm.joiner
or std.array.join with writeln. Using joiner with writeln is quite slow, 3-4x slower
than std.array.join with writeln. The joiner performance may be due to interaction
with writeln, this was not investigated. Using joiner with stdout.lockingTextWriter
is better, but still substantially slower than join. Using join works reasonably well,
but is allocating memory unnecessarily.

Using joinAppend with Appender is a bit faster than join, and allocates less memory.
The Appender re-uses the underlying data buffer, saving memory. The example below
illustrates. It is a modification of the InputFieldReordering example. The role
Appender plus joinAppend are playing is to buffer the output. BufferedOutputRange
uses a similar technique to buffer multiple lines.

Note: The original uses of joinAppend have been replaced by BufferedOutputRange, which
has its own joinAppend method. However, joinAppend remains useful when constructing
internal buffers where BufferedOutputRange is not appropriate.

Params:
    inputRange = The elements to join. Either a range of arrays of E or a range of E.
    outputRange = The output range the joined elements are appended to. Existing
        content is kept; nothing is cleared.
    delimiter = The value written between consecutive elements. It is not written
        before the first element or after the last one.

Returns: The output range passed in. Nothing is appended if inputRange is empty.

---
int main(string[] args)
{
    import tsv_utils.common.utils;
    import std.algorithm, std.array, std.range, std.stdio;
    size_t[] fieldIndicies = [3, 0, 2];
    auto fieldReordering = new InputFieldReordering!char(fieldIndicies);
    auto outputBuffer = appender!(char[]);
    foreach (line; stdin.byLine)
    {
        fieldReordering.initNewLine;
        foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate)
        {
            fieldReordering.processNextField(fieldIndex, fieldValue);
            if (fieldReordering.allFieldsFilled) break;
        }
        if (fieldReordering.allFieldsFilled)
        {
            outputBuffer.clear;
            writeln(fieldReordering.outputFields.joinAppend(outputBuffer, '\t').data);
        }
        else
        {
            writeln("Error: Insufficient number of field on the line.");
        }
    }
    return 0;
}
---
*/
OutputRange joinAppend(InputRange, OutputRange, E)
    (InputRange inputRange, ref OutputRange outputRange, E delimiter)
if (isInputRange!InputRange &&
    /* Both alternatives are parenthesized together so that the isInputRange
     * requirement applies to each of them ('&&' binds tighter than '||').
     */
    ((is(ElementType!InputRange : const E[]) &&
      isOutputRange!(OutputRange, E[]))
     ||
     (is(ElementType!InputRange : const E) &&
      isOutputRange!(OutputRange, E))
    ))
{
    /* The first element is written without a leading delimiter. */
    if (!inputRange.empty)
    {
        outputRange.put(inputRange.front);
        inputRange.popFront;
    }

    /* Every remaining element is preceded by the delimiter. */
    foreach (x; inputRange)
    {
        outputRange.put(delimiter);
        outputRange.put(x);
    }
    return outputRange;
}
1158 
// joinAppend.
@safe unittest
{
    import std.array : appender;
    import std.algorithm : equal;

    char[] c1 = ['a', 'b', 'c'];
    char[] c2 = ['d', 'e', 'f'];
    char[] c3 = ['g', 'h', 'i'];
    auto cvec = [c1, c2, c3];

    auto s1 = "abc";
    auto s2 = "def";
    auto s3 = "ghi";
    auto svec = [s1, s2, s3];

    auto charAppender = appender!(char[])();

    /* Joining must not modify the caller's input array. */
    assert(cvec.joinAppend(charAppender, '_').data == "abc_def_ghi");
    assert(equal(cvec, [c1, c2, c3]));

    /* Output is appended to whatever the output range already holds. */
    charAppender.put('$');
    assert(svec.joinAppend(charAppender, '|').data == "abc_def_ghi$abc|def|ghi");
    assert(equal(svec, [s1, s2, s3]));

    charAppender.clear;
    assert(svec.joinAppend(charAppender, '|').data == "abc|def|ghi");

    auto intAppender = appender!(int[])();

    auto i1 = [100, 101, 102];
    auto i2 = [200, 201, 202];
    auto i3 = [300, 301, 302];
    auto ivec = [i1, i2, i3];

    /* Range of arrays: the delimiter goes between the arrays. */
    assert(ivec.joinAppend(intAppender, 0).data ==
           [100, 101, 102, 0, 200, 201, 202, 0, 300, 301, 302]);

    /* Range of elements: the delimiter goes between the individual elements. */
    intAppender.clear;
    assert(i1.joinAppend(intAppender, 0).data ==
           [100, 0, 101, 0, 102]);
    assert(i2.joinAppend(intAppender, 1).data ==
           [100, 0, 101, 0, 102,
            200, 1, 201, 1, 202]);
    assert(i3.joinAppend(intAppender, 2).data ==
           [100, 0, 101, 0, 102,
            200, 1, 201, 1, 202,
            300, 2, 301, 2, 302]);
}
1208 
/**
getTsvFieldValue extracts the value of a single field from a delimited text string.

This is a convenience function for cases where only one field of an input line is
needed. When several values are needed it is more efficient to work directly with
std.algorithm.splitter or the InputFieldReordering class.

Params:
    line = The input text. It is split on `delim`.
    fieldIndex = Zero-based index of the field to extract.
    delim = The field delimiter character.

Returns: The requested field converted to type T using std.conv.to.

Throws: std.conv.ConvException if the conversion fails. An Exception if the line has
too few fields; its message uses 1-upped field numbers, as would be provided by command
line users.
 */
T getTsvFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim)
if (isSomeChar!C)
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.format : format;
    import std.range;

    auto fields = line.splitter(delim);
    size_t fieldsPassed = 0;

    /* Step over the fields preceding the requested one. */
    for (; fieldsPassed < fieldIndex && !fields.empty; ++fieldsPassed)
    {
        fields.popFront;
    }

    if (!fields.empty) return fields.front.to!T;

    if (fieldIndex != 0)
    {
        throw new Exception(
            format("Not enough fields on line. Number required: %d; Number found: %d",
                   fieldIndex + 1, fieldsPassed));
    }

    /* Field zero was requested and the split range is empty. This is a workaround to
     * a splitter special case - If the input is empty, the returned split range is
     * empty. This doesn't properly represent a single column file. More correct
     * mathematically, and for this case, would be a single value representing an
     * empty string. The input line is a convenient source of an empty line. Info:
     *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
     *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
     */
    assert(line.empty);
    return line.to!T;
}
1271 
// getTsvFieldValue.
@safe unittest
{
    import std.conv : ConvException, to;
    import std.exception;

    /* Common cases. */
    assert(getTsvFieldValue!double("123", 0, '\t') == 123.0);
    assert(getTsvFieldValue!double("-10.5", 0, '\t') == -10.5);
    assert(getTsvFieldValue!size_t("abc|123", 1, '|') == 123);
    assert(getTsvFieldValue!int("紅\t红\t99", 2, '\t') == 99);
    assert(getTsvFieldValue!int("红\t紅\t99", 2, '\t') == 99);
    assert(getTsvFieldValue!string("紅\t红\t99", 2, '\t') == "99");
    assert(getTsvFieldValue!string("紅\t红\t99", 1, '\t') == "红");
    assert(getTsvFieldValue!string("紅\t红\t99", 0, '\t') == "紅");
    assert(getTsvFieldValue!string("红色和绿色\tred and green\t赤と緑\t10.5", 2, '\t') == "赤と緑");
    assert(getTsvFieldValue!double("红色和绿色\tred and green\t赤と緑\t10.5", 3, '\t') == 10.5);

    /* The empty field cases. */
    assert(getTsvFieldValue!string("", 0, '\t') == "");
    assert(getTsvFieldValue!string("\t", 0, '\t') == "");
    assert(getTsvFieldValue!string("\t", 1, '\t') == "");
    assert(getTsvFieldValue!string("", 0, ':') == "");
    assert(getTsvFieldValue!string(":", 0, ':') == "");
    assert(getTsvFieldValue!string(":", 1, ':') == "");

    /* Tests with different data types. */
    string stringLine = "orange and black\tნარინჯისფერი და შავი\t88.5";
    char[] charLine = "orange and black\tნარინჯისფერი და შავი\t88.5".to!(char[]);
    dchar[] dcharLine = stringLine.to!(dchar[]);
    wchar[] wcharLine = stringLine.to!(wchar[]);

    assert(getTsvFieldValue!string(stringLine, 0, '\t') == "orange and black");
    assert(getTsvFieldValue!string(stringLine, 1, '\t') == "ნარინჯისფერი და შავი");
    assert(getTsvFieldValue!wstring(stringLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);
    assert(getTsvFieldValue!double(stringLine, 2, '\t') == 88.5);

    assert(getTsvFieldValue!string(charLine, 0, '\t') == "orange and black");
    assert(getTsvFieldValue!string(charLine, 1, '\t') == "ნარინჯისფერი და შავი");
    assert(getTsvFieldValue!wstring(charLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);
    assert(getTsvFieldValue!double(charLine, 2, '\t') == 88.5);

    assert(getTsvFieldValue!string(dcharLine, 0, '\t') == "orange and black");
    assert(getTsvFieldValue!string(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი");
    assert(getTsvFieldValue!wstring(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);
    assert(getTsvFieldValue!double(dcharLine, 2, '\t') == 88.5);

    assert(getTsvFieldValue!string(wcharLine, 0, '\t') == "orange and black");
    assert(getTsvFieldValue!string(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი");
    assert(getTsvFieldValue!wstring(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);
    assert(getTsvFieldValue!double(wcharLine, 2, '\t') == 88.5);

    /* Conversion errors. */
    assertThrown!ConvException(getTsvFieldValue!double("", 0, '\t'));
    assertThrown!ConvException(getTsvFieldValue!double("abc", 0, '|'));
    assertThrown!ConvException(getTsvFieldValue!size_t("-1", 0, '|'));
    assertThrown!ConvException(getTsvFieldValue!size_t("a23|23.4", 1, '|'));
    assertThrown!ConvException(getTsvFieldValue!double("23.5|def", 1, '|'));

    /* Not enough field errors. These should throw, but not a ConvException.*/
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("", 1, '\t')));
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc", 1, '\t')));
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc\tdef", 2, '\t')));
}
1336 
1337 /** [Yes|No.newlineWasRemoved] is a template parameter to throwIfWindowsNewlineOnUnix.
1338  *  A Yes value indicates the Unix newline was already removed, as might be done via
1339  *  std.stdio.File.byLine or a similar mechanism.
1340  */
1341 alias NewlineWasRemoved = Flag!"newlineWasRemoved";
1342 
/**
throwIfWindowsNewlineOnUnix is used to throw an exception if a Windows/DOS
line ending is found on a build compiled for a Unix platform. This is used by
the TSV Utilities to detect Windows/DOS line endings and terminate processing
with an error message to the user.

With Yes.newlineWasRemoved (the default) a line ending in '\r' is reported. With
No.newlineWasRemoved a line ending in "\r\n" is reported. On non-Posix builds the
function does nothing.

Params:
    line = The line to check.
    filename = Name of the input source, used in the error message. A name of "-"
        is reported as "Standard Input".
    lineNum = Line number used in the error message.

Throws: Exception if a Windows/DOS line ending is found (Posix builds only).
 */
void throwIfWindowsNewlineOnUnix
    (NewlineWasRemoved nlWasRemoved = Yes.newlineWasRemoved)
    (const char[] line, const char[] filename, size_t lineNum)
{
    version(Posix)
    {
        /* Return early unless the line ends the way a Windows/DOS line would. */
        static if (nlWasRemoved)
        {
            if (line.length == 0 || line[$ - 1] != '\r') return;
        }
        else
        {
            if (line.length <= 1 || line[$ - 2] != '\r' || line[$ - 1] != '\n') return;
        }

        import std.format : format;

        const(char)[] sourceName = (filename == "-") ? "Standard Input" : filename;

        throw new Exception(
            format("Windows/DOS line ending found. Convert file to Unix newlines before processing (e.g. 'dos2unix').\n  File: %s, Line: %s",
                   sourceName, lineNum));
    }
}
1376 
// throwIfWindowsNewlineOnUnix
@safe unittest
{
    /* Note: Only Posix behavior is tested. Non-Posix test cases need to be added
     * if Windows builds are ever done.
     */
    version(Posix)
    {
        import std.algorithm : endsWith;
        import std.exception;

        /* Newline already removed (the default): only a trailing '\r' throws. */
        assertNotThrown(throwIfWindowsNewlineOnUnix("", "afile.tsv", 1));
        assertNotThrown(throwIfWindowsNewlineOnUnix("a", "afile.tsv", 2));
        assertNotThrown(throwIfWindowsNewlineOnUnix("ab", "afile.tsv", 3));
        assertNotThrown(throwIfWindowsNewlineOnUnix("abc", "afile.tsv", 4));

        assertThrown(throwIfWindowsNewlineOnUnix("\r", "afile.tsv", 1));
        assertThrown(throwIfWindowsNewlineOnUnix("a\r", "afile.tsv", 2));
        assertThrown(throwIfWindowsNewlineOnUnix("ab\r", "afile.tsv", 3));
        assertThrown(throwIfWindowsNewlineOnUnix("abc\r", "afile.tsv", 4));

        /* Newline still present: only a trailing "\r\n" throws. */
        alias checkWithNewline = throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved);

        assertNotThrown(checkWithNewline("\n", "afile.tsv", 1));
        assertNotThrown(checkWithNewline("a\n", "afile.tsv", 2));
        assertNotThrown(checkWithNewline("ab\n", "afile.tsv", 3));
        assertNotThrown(checkWithNewline("abc\n", "afile.tsv", 4));

        assertThrown(checkWithNewline("\r\n", "afile.tsv", 5));
        assertThrown(checkWithNewline("a\r\n", "afile.tsv", 6));
        assertThrown(checkWithNewline("ab\r\n", "afile.tsv", 7));
        assertThrown(checkWithNewline("abc\r\n", "afile.tsv", 8));

        /* Standard Input formatting. */
        auto removedError = collectException(throwIfWindowsNewlineOnUnix("\r", "-", 99));
        assert(removedError !is null);
        assert(removedError.msg.endsWith("File: Standard Input, Line: 99"));

        auto presentError = collectException(checkWithNewline("\r\n", "-", 99));
        assert(presentError !is null);
        assert(presentError.msg.endsWith("File: Standard Input, Line: 99"));
    }
}
1436 
1437 /** Flag used by InputSourceRange to determine if the header line should be read when
1438 opening a file.
1439 */
1440 alias ReadHeader = Flag!"readHeader";
1441 
/**
inputSourceRange is a convenience function that constructs a new InputSourceRange
over the given filepaths, forwarding the readHeader setting.
*/
InputSourceRange inputSourceRange(string[] filepaths, ReadHeader readHeader)
{
    auto sources = new InputSourceRange(filepaths, readHeader);
    return sources;
}
1449 
/**
InputSourceRange is an input range that iterates over a set of input files.

InputSourceRange is used to iterate over a set of files passed on the command line.
Files are automatically opened and closed during iteration. The caller can choose to
have header lines read automatically.

The range is created from a set of filepaths. These filepaths are mapped to
InputSource objects during the iteration. This is what enables automatically opening
and closing files and reading the header line.

The motivation for an InputSourceRange is to provide a standard way to look at the
header line of the first input file during command line argument processing, and then
pass the open input file and the header line along to the main processing functions.
This enables features like named fields to be implemented in a standard way.

Both InputSourceRange and InputSource are reference objects. This keeps their use
limited to a single iteration over the set of files. The files can be iterated again
by creating a new InputSourceRange against the same filepaths.

Currently, InputSourceRange supports files and standard input. It is possible other
types of input sources will be added in the future.
 */
final class InputSourceRange
{
    import std.range;

    private string[] _filepaths;      // Paths not yet turned into an InputSource.
    private ReadHeader _readHeader;   // Forwarded to every InputSource created.
    private InputSource _front;       // Current element; null when the range is empty.

    /** Creates the range over a private copy of filepaths. If there is at least one
     * filepath, the first input source is opened immediately.
     */
    this(string[] filepaths, ReadHeader readHeader)
    {
        _filepaths = filepaths.dup;
        _readHeader = readHeader;
        advanceToNextSource();
    }

    /** Number of input sources remaining, including the current front element. */
    size_t length() const pure nothrow @safe
    {
        if (empty) return 0;
        return _filepaths.length + 1;
    }

    bool empty() const pure nothrow @safe
    {
        return _front is null;
    }

    InputSource front() pure @safe
    {
        assert(!empty, "Attempt to take the front of an empty InputSourceRange");
        return _front;
    }

    /** Closes the current input source and opens the next one, if any. */
    void popFront()
    {
        assert(!empty, "Attempt to popFront an empty InputSourceRange");

        _front.close;
        advanceToNextSource();
    }

    /* Makes the next filepath the front element, opening it. Sets the front element
     * to null when no filepaths remain.
     */
    private void advanceToNextSource()
    {
        if (_filepaths.empty)
        {
            _front = null;
            return;
        }

        _front = new InputSource(_filepaths.front, _readHeader);
        _front.open;
        _filepaths.popFront;
    }
}
1529 
/**
InputSource is a class of objects produced by iterating over an InputSourceRange.

An InputSource object provides access to the open file that is currently the front
element of an InputSourceRange. The main methods application code is likely to need
are:

$(LIST
    * `file()` - Returns the File object. The file will be open for reading as long
      as the InputSource instance is the front element of the InputSourceRange it
      came from.

    * `header(KeepTerminator keepTerminator = No.keepTerminator)` - Returns the
      header line from the file. An empty string is returned if the InputSourceRange
      was created with readHeader=false.

    * `name()` - The name of the input source. The name returned is intended for
      user error messages. For files, this is the filepath that was passed to
      InputSourceRange. For standard input, it is "Standard Input".
)

An InputSource is a reference object, so copies retain the state of the
InputSourceRange front element. In particular, all copies will have the open
state of the front element of the InputSourceRange.

This class is not intended for use outside the context of an InputSourceRange.
*/
final class InputSource
{
    import std.range;
    import std.stdio;

    private immutable string _filepath;
    private immutable bool _isStdin;
    private bool _isOpen;
    private ReadHeader _readHeader;
    private bool _hasBeenOpened;
    private string _header;
    private File _file;

    /* Records the filepath and header option. A filepath of "-" denotes standard
     * input. Nothing is opened until open() is called.
     */
    private this(string filepath, ReadHeader readHeader) pure nothrow @safe
    {
        _filepath = filepath;
        _readHeader = readHeader;
        _isStdin = (filepath == "-");
        _hasBeenOpened = false;
        _isOpen = false;
    }

    /** file returns the File object held by the InputSource.
     *
     * The File will be open for reading as long as the InputSource instance is the
     * front element of the InputSourceRange it came from.
     */
    File file() nothrow @safe
    {
        return _file;
    }

    /** isReadHeaderEnabled returns true if the header line is being read.
     */
    bool isReadHeaderEnabled() const pure nothrow @safe
    {
        return _readHeader == Yes.readHeader;
    }

    /** header returns the header line from the input file.
     *
     * A single trailing newline is removed unless keepTerminator is
     * Yes.keepTerminator. An empty string is returned if the InputSourceRange was
     * created with readHeader=false.
     */
    string header(KeepTerminator keepTerminator = No.keepTerminator) const pure nothrow @safe
    {
        assert(_hasBeenOpened);

        /* Only strip when asked to and when there is a newline to strip. */
        const bool stripTerminator =
            keepTerminator != Yes.keepTerminator &&
            _header.length != 0 &&
            _header[$ - 1] == '\n';

        if (stripTerminator) return _header[0 .. $ - 1];
        return _header;
    }

    /** isHeaderEmpty returns true if there is no data for a header, including the
     * terminator.
     *
     * When headers are being read, this is true only if the file is empty.
     */
    bool isHeaderEmpty() const pure nothrow @safe
    {
        assert(_hasBeenOpened);
        return _header.length == 0;
    }

    /** name returns a user friendly name representing the input source.
     *
     * For files, it is the filepath provided to InputSourceRange. For standard
     * input, it is "Standard Input". (Use isStdin() to test for standard input,
     * not name().)
     */
    string name() const pure nothrow @safe
    {
        if (_isStdin) return "Standard Input";
        return _filepath;
    }

    /** isStdin returns true if the input source is Standard Input, false otherwise.
     */
    bool isStdin() const pure nothrow @safe
    {
        return _isStdin;
    }

    /** isOpen returns true if the input source is open for reading, false otherwise.
     *
     * "Open" in this context is whether the InputSource object is currently open,
     * meaning that it is the front element of the InputSourceRange that created it.
     *
     * For files, this is also reflected in the state of the underlying File object.
     * However, standard input is never actually closed.
     */
    bool isOpen() const pure nothrow @safe
    {
        return _isOpen;
    }

    /* Opens the underlying input and, when header reading is enabled, reads the
     * first line into _header. An InputSource may be opened only once.
     */
    private void open()
    {
        assert(!_isOpen);
        assert(!_hasBeenOpened);

        if (_isStdin)
        {
            _file = stdin;
        }
        else
        {
            _file = File(_filepath, "rb");
        }

        if (isReadHeaderEnabled) _header = _file.readln;

        _hasBeenOpened = true;
        _isOpen = true;
    }

    /* Marks the InputSource as closed. Real files are closed; standard input is
     * left open.
     */
    private void close()
    {
        if (!_isStdin)
        {
            _file.close;
        }

        _isOpen = false;
    }
}
1667 
// InputSourceRange and InputSource
unittest
{
    import std.algorithm : all, each;
    import std.array : appender;
    import std.exception : assertThrown;
    import std.file : rmdirRecurse;
    import std.path : buildPath;
    import std.range;
    import std.stdio;
    import tsv_utils.common.unittest_utils;

    auto testDir = makeUnittestTempDir("tsv_utils_input_source_range");
    scope(exit) testDir.rmdirRecurse;

    /* Test fixtures, indexed by file number: an empty file, a header-only file,
     * and two files with a header followed by one and two body lines.
     */
    string[] inputFiles = [
        buildPath(testDir, "file0.txt"),
        buildPath(testDir, "file1.txt"),
        buildPath(testDir, "file2.txt"),
        buildPath(testDir, "file3.txt"),
    ];

    string[] fileHeaders = [
        "",
        "file 1 header\n",
        "file 2 header\n",
        "file 3 header\n",
    ];

    string[] fileBodies = [
        "",
        "",
        "file 2 line 1\n",
        "file 3 line 1\nfile 3 line 2\n",
    ];

    /* Full file contents are the header followed by the body. */
    string[] fileData;
    foreach (i, hdr; fileHeaders) fileData ~= hdr ~ fileBodies[i];

    /* Write the files. Each temporary File is closed at the end of its statement. */
    foreach (i, path; inputFiles)
    {
        path.File("w").write(fileData[i]);
    }

    auto seenSources = appender!(InputSource[]);
    auto readBuffer = new char[1024];    // Must be large enough to hold the test files.

    /* Tests without standard input. Don't want to count on state of standard
     * input or modifying it when doing unit tests, so avoid reading from it.
     */

    foreach (numFiles; 1 .. inputFiles.length + 1)
    {
        /* Reading headers. */

        seenSources.clear;
        auto withHeaders = inputSourceRange(inputFiles[0 .. numFiles], Yes.readHeader);
        assert(withHeaders.length == numFiles);

        foreach (fileNum, source; withHeaders.enumerate)
        {
            seenSources.put(source);

            /* Only the current source is open; all earlier ones have been closed. */
            assert(source.isOpen);
            assert(source.file.isOpen);
            assert(seenSources.data[0 .. fileNum].all!(s => !s.isOpen));
            assert(seenSources.data[fileNum].isOpen);

            immutable expectedHeader = fileHeaders[fileNum];
            assert(source.header(Yes.keepTerminator) == expectedHeader);

            /* Without the terminator the header is one character shorter,
             * unless it is empty.
             */
            size_t trimmedLength = expectedHeader.length;
            if (trimmedLength > 0) --trimmedLength;
            assert(source.header(No.keepTerminator) == expectedHeader[0 .. trimmedLength]);

            assert(source.name == inputFiles[fileNum]);
            assert(!source.isStdin);
            assert(source.isReadHeaderEnabled);

            /* The header has been consumed, so only the body remains. */
            assert(source.file.rawRead(readBuffer) == fileBodies[fileNum]);
        }

        /* The InputSourceRange is a reference range, consumed by the foreach. */
        assert(withHeaders.empty);

        /* Without reading headers. */

        seenSources.clear;
        auto withoutHeaders = inputSourceRange(inputFiles[0 .. numFiles], No.readHeader);
        assert(withoutHeaders.length == numFiles);

        foreach (fileNum, source; withoutHeaders.enumerate)
        {
            seenSources.put(source);

            assert(source.isOpen);
            assert(source.file.isOpen);
            assert(seenSources.data[0 .. fileNum].all!(s => !s.isOpen));
            assert(seenSources.data[fileNum].isOpen);

            assert(source.header(Yes.keepTerminator).empty);
            assert(source.header(No.keepTerminator).empty);

            assert(source.name == inputFiles[fileNum]);
            assert(!source.isStdin);
            assert(!source.isReadHeaderEnabled);

            /* No header was read, so the whole file is still available. */
            assert(source.file.rawRead(readBuffer) == fileData[fileNum]);
        }

        /* The InputSourceRange is a reference range, consumed by the foreach. */
        assert(withoutHeaders.empty);
    }

    /* Tests with standard input. No actual reading in these tests.
     */

    seenSources.clear;
    auto stdinSources = inputSourceRange(["-", "-"], No.readHeader);
    foreach (fileNum, source; stdinSources.enumerate)
    {
        seenSources.put(source);
        assert(source.isOpen);
        assert(source.file.isOpen);
        assert(seenSources.data[0 .. fileNum].all!(s => !s.isOpen));      // InputSource objects are "closed".
        assert(seenSources.data[0 .. fileNum].all!(s => s.file.isOpen));  // Actual stdin should not be closed.
        assert(seenSources.data[fileNum].isOpen);

        assert(source.header(Yes.keepTerminator).empty);
        assert(source.header(No.keepTerminator).empty);

        assert(source.name == "Standard Input");
        assert(source.isStdin);
    }

    /* Empty filelist. */
    string[] nofiles;
    {
        auto sources = inputSourceRange(nofiles, No.readHeader);
        assert(sources.empty);
    }
    {
        auto sources = inputSourceRange(nofiles, Yes.readHeader);
        assert(sources.empty);
    }

    /* Error cases. */
    assertThrown(inputSourceRange([inputFiles[0], "no_such_file.txt"], No.readHeader).each);
    assertThrown(inputSourceRange(["no_such_file.txt", inputFiles[1]], Yes.readHeader).each);
}
1818 
/**
byLineSourceRange is a helper function for creating new ByLineSourceRange objects.

The template arguments are forwarded unchanged to ByLineSourceRange, and the
filepaths are passed to its constructor.
*/
auto byLineSourceRange(
    KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n')
(string[] filepaths)
if (is(Char == char) || is(Char == ubyte))
{
    alias RangeType = ByLineSourceRange!(keepTerminator, Char, terminator);
    return new RangeType(filepaths);
}
1829 
/**
ByLineSourceRange is an input range that iterates over a set of input files. It
provides bufferedByLine access to each file.

A ByLineSourceRange is used to iterate over a set of files passed on the command line.
Files are automatically opened and closed during iteration. The front element of the
range provides access to a bufferedByLine for iterating over the lines in the file.

The range is created from a set of filepaths. These filepaths are mapped to
ByLineSource objects during the iteration. This is what enables automatically opening
and closing files and providing bufferedByLine access.

The motivation behind ByLineSourceRange is to provide a standard way to look at the
header line of the first input file during command line argument processing, and then
pass the open input file along to the main processing functions. This enables
features like named fields to be implemented in a standard way.

Access to the first line of the first file is available after creating the
ByLineSourceRange instance. The first file is opened and a bufferedByLine created.
The first line of the first file is available via byLine.front (after checking
!byLine.empty).

Both ByLineSourceRange and ByLineSource are reference objects. This keeps their use
limited to a single iteration over the set of files. The files can be iterated again
by creating a new ByLineSourceRange against the same filepaths.

Currently, ByLineSourceRange supports files and standard input. It is possible other
types of input sources will be added in the future.
 */
final class ByLineSourceRange(
    KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n')
if (is(Char == char) || is(Char == ubyte))
{
    import std.range;

    /* Forward the Char parameter so the element type requested by the caller is
     * the one used by the ByLineSource objects this range produces.
     */
    alias ByLineSourceType = ByLineSource!(keepTerminator, Char, terminator);

    private string[] _filepaths;
    private ByLineSourceType _front;

    /** Creates the range from a list of filepaths. The list is copied. If it is
     * non-empty, the first file is opened immediately and becomes the front
     * element.
     */
    this(string[] filepaths)
    {
        _filepaths = filepaths.dup;
        _front = null;

        if (!_filepaths.empty)
        {
            _front = new ByLineSourceType(_filepaths.front);
            _front.open;
            _filepaths.popFront;
        }
    }

    /** length returns the number of input sources left in the range, including the
     * current front element. It is zero once the range is empty.
     */
    size_t length() const pure nothrow @safe
    {
        /* _filepaths holds only the paths not yet opened; add one for the front. */
        return empty ? 0 : _filepaths.length + 1;
    }

    /** empty returns true when the range has no current front element. */
    bool empty() const pure nothrow @safe
    {
        return _front is null;
    }

    /** front returns the ByLineSource currently at the front of the range.
     *
     * It is an error to call this on an empty range.
     */
    ByLineSourceType front() pure @safe
    {
        assert(!empty, "Attempt to take the front of an empty ByLineSourceRange");
        return _front;
    }

    /** popFront closes the current front element and advances to the next filepath.
     *
     * If another filepath remains, a new ByLineSource is created for it and opened.
     * Otherwise the range becomes empty. It is an error to call this on an empty
     * range.
     */
    void popFront()
    {
        assert(!empty, "Attempt to popFront an empty ByLineSourceRange");

        _front.close;

        if (!_filepaths.empty)
        {
            _front = new ByLineSourceType(_filepaths.front);
            _front.open;
            _filepaths.popFront;
        }
        else
        {
            /* No more inputs; a null front marks the range as empty. */
            _front = null;
        }
    }
}
1916 
/**
ByLineSource is a class of objects produced by iterating over a ByLineSourceRange.

A ByLineSource instance provides a bufferedByLine range for the current front
element of a ByLineSourceRange. The main methods application code is likely to
need are:

$(LIST
    * `byLine()` - Returns the bufferedByLine range accessing the open file. The file
       will be open for reading (using the bufferedByLine range) as long as the
       ByLineSource instance is the front element of the ByLineSourceRange
       it came from.

    * `name()` - The name of the input source. The name returned is intended for
      user error messages. For files, this is the filepath that was passed to
      ByLineSourceRange. For standard input, it is "Standard Input".
)

A ByLineSource is a reference object, so the copies have the same state as the
ByLineSourceRange front element. In particular, all copies will have the open
state of the front element of the ByLineSourceRange.

This class is not intended for use outside the context of a ByLineSourceRange.
*/
final class ByLineSource(
    KeepTerminator keepTerminator, Char = char, ubyte terminator = '\n')
if (is(Char == char) || is(Char == ubyte))
{
    import std.range;
    import std.stdio;
    import std.traits : ReturnType;

    /* Forward the Char parameter so the bufferedByLine range uses the element
     * type this class was instantiated with.
     */
    alias newByLineFn = bufferedByLine!(keepTerminator, Char, terminator);
    alias ByLineType = ReturnType!newByLineFn;

    private immutable string _filepath;
    private immutable bool _isStdin;
    private bool _isOpen;
    private bool _hasBeenOpened;
    private File _file;
    private ByLineType _byLineRange;

    /* Records the filepath. A filepath of "-" denotes standard input. Nothing is
     * opened until open() is called.
     */
    private this(string filepath) pure nothrow @safe
    {
        _filepath = filepath;
        _isStdin = filepath == "-";
        _isOpen = false;
        _hasBeenOpened = false;
    }

    /** byLine returns the bufferedByLine object held by the ByLineSource instance.
     *
     * The File underlying the BufferedByLine object is open for reading as long as
     * the ByLineSource instance is the front element of the ByLineSourceRange it
     * came from.
     */
    ByLineType byLine() nothrow @safe
    {
        return _byLineRange;
    }

    /** name returns a user friendly name representing the underlying input source.
     *
     * For files, it is the filepath provided to ByLineSourceRange. For standard
     * input, it is "Standard Input". (Use isStdin() to test for standard input
     * rather than comparing against name().)
     */
    string name() const pure nothrow @safe
    {
        return _isStdin ? "Standard Input" : _filepath;
    }

    /** isStdin returns true if the underlying input source is Standard Input, false
     * otherwise.
     */
    bool isStdin() const pure nothrow @safe
    {
        return _isStdin;
    }

    /** isOpen returns true if the ByLineSource instance is open for reading, false
     * otherwise.
     *
     * "Open" in this context is whether the ByLineSource object is currently "open".
     * The underlying input source backing it does not necessarily have the same
     * state. The ByLineSource instance is "open" if it is the front element of the
     * ByLineSourceRange that created it.
     *
     * The underlying input source object follows the same open/close state as makes
     * sense. In particular, real files are closed when the ByLineSource object is
     * closed. The exception is standard input, which is never actually closed.
     */
    bool isOpen() const pure nothrow @safe
    {
        return _isOpen;
    }

    /* Opens the underlying input and creates the bufferedByLine range over it.
     * A ByLineSource may be opened only once.
     */
    private void open()
    {
        assert(!_isOpen);
        assert(!_hasBeenOpened);

        _file = isStdin ? stdin : _filepath.File("rb");
        _byLineRange = newByLineFn(_file);
        _isOpen = true;
        _hasBeenOpened = true;
    }

    /* Marks the ByLineSource as closed. Real files are closed; standard input is
     * left open.
     */
    private void close()
    {
        if (!_isStdin) _file.close;
        _isOpen = false;
    }
}
2031 
// ByLineSourceRange and ByLineSource
unittest
{
    import std.algorithm : all, each;
    import std.array : appender;
    import std.exception : assertThrown;
    import std.file : rmdirRecurse;
    import std.path : buildPath;
    import std.range;
    import std.stdio;
    import tsv_utils.common.unittest_utils;

    auto testDir = makeUnittestTempDir("tsv_utils_byline_input_source_range");
    scope(exit) testDir.rmdirRecurse;

    /* Test fixtures, indexed by file number: an empty file, a header-only file,
     * and two files with a header followed by one and two body lines.
     */
    string[] inputFiles = [
        buildPath(testDir, "file0.txt"),
        buildPath(testDir, "file1.txt"),
        buildPath(testDir, "file2.txt"),
        buildPath(testDir, "file3.txt"),
    ];

    string[] fileHeaders = [
        "",
        "file 1 header\n",
        "file 2 header\n",
        "file 3 header\n",
    ];

    string[] fileBodies = [
        "",
        "",
        "file 2 line 1\n",
        "file 3 line 1\nfile 3 line 2\n",
    ];

    /* Full file contents are the header followed by the body. */
    string[] fileData;
    foreach (i, hdr; fileHeaders) fileData ~= hdr ~ fileBodies[i];

    /* Write the files. Each temporary File is closed at the end of its statement. */
    foreach (i, path; inputFiles)
    {
        path.File("w").write(fileData[i]);
    }

    /* Tests without standard input. Don't want to count on state of standard
     * input or modifying it when doing unit tests, so avoid reading from it.
     */

    auto readSourcesNoTerminator = appender!(ByLineSource!(No.keepTerminator)[]);
    auto readSourcesYesTerminator = appender!(ByLineSource!(Yes.keepTerminator)[]);

    foreach (numFiles; 1 .. inputFiles.length + 1)
    {
        /* Using No.keepTerminator. */
        readSourcesNoTerminator.clear;
        auto inputSourcesNoTerminator = byLineSourceRange!(No.keepTerminator)(inputFiles[0 .. numFiles]);
        assert(inputSourcesNoTerminator.length == numFiles);

        foreach (fileNum, source; inputSourcesNoTerminator.enumerate)
        {
            readSourcesNoTerminator.put(source);

            /* Only the current source is open; all earlier ones have been closed. */
            assert(source.isOpen);
            assert(source._file.isOpen);
            assert(readSourcesNoTerminator.data[0 .. fileNum].all!(s => !s.isOpen));
            assert(readSourcesNoTerminator.data[fileNum].isOpen);

            /* Without the terminator the first line is one character shorter than
             * the stored header, unless the header is empty.
             */
            auto headerNoTerminatorLength = fileHeaders[fileNum].length;
            if (headerNoTerminatorLength > 0) --headerNoTerminatorLength;

            assert(source.byLine.empty ||
                   source.byLine.front == fileHeaders[fileNum][0 .. headerNoTerminatorLength]);

            assert(source.name == inputFiles[fileNum]);
            assert(!source.isStdin);

            /* Rebuild the file contents, restoring the newline stripped from each line. */
            auto readFileData = appender!(char[]);
            foreach (line; source.byLine)
            {
                readFileData.put(line);
                readFileData.put('\n');
            }

            assert(readFileData.data == fileData[fileNum]);
        }

        /* The ByLineSourceRange is a reference range, consumed by the foreach. */
        assert(inputSourcesNoTerminator.empty);

        /* Using Yes.keepTerminator. */
        readSourcesYesTerminator.clear;
        auto inputSourcesYesTerminator = byLineSourceRange!(Yes.keepTerminator)(inputFiles[0 .. numFiles]);
        assert(inputSourcesYesTerminator.length == numFiles);

        foreach (fileNum, source; inputSourcesYesTerminator.enumerate)
        {
            readSourcesYesTerminator.put(source);
            assert(source.isOpen);
            assert(source._file.isOpen);
            assert(readSourcesYesTerminator.data[0 .. fileNum].all!(s => !s.isOpen));
            assert(readSourcesYesTerminator.data[fileNum].isOpen);

            assert(source.byLine.empty || source.byLine.front == fileHeaders[fileNum]);

            assert(source.name == inputFiles[fileNum]);
            assert(!source.isStdin);

            /* Lines keep their terminators, so concatenation reproduces the file. */
            auto readFileData = appender!(char[]);
            foreach (line; source.byLine)
            {
                readFileData.put(line);
            }

            assert(readFileData.data == fileData[fileNum]);
        }

        /* The ByLineSourceRange is a reference range, consumed by the foreach. */
        assert(inputSourcesYesTerminator.empty);
    }

    /* Empty filelist. */
    string[] nofiles;
    {
        auto sources = byLineSourceRange!(No.keepTerminator)(nofiles);
        assert(sources.empty);
    }
    {
        auto sources = byLineSourceRange!(Yes.keepTerminator)(nofiles);
        assert(sources.empty);
    }

    /* Error cases. */
    assertThrown(byLineSourceRange!(No.keepTerminator)([inputFiles[0], "no_such_file.txt"]).each);
    assertThrown(byLineSourceRange!(Yes.keepTerminator)(["no_such_file.txt", inputFiles[1]]).each);
}