tsv_sample source code

1 /**
2 Command line tool implementing weighted reservoir sampling on delimited data files.
3 Weights are read from a field in the file.
4 
5 Copyright (c) 2017-2018, eBay Software Foundation
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_sample;
11 
12 import std.range;
13 import std.stdio;
14 import std.typecons : tuple, Flag;
15 
version(unittest)
{
    // Unit test builds take their entry point from the compiler's -main switch,
    // so no main function is defined here.
}
else
{
    /** Program entry point.
     *
     * Processes the command line, then runs the sampling routine selected by the
     * options, writing results to standard output via a BufferedOutputRange.
     *
     * Returns: The exit status from argument processing when it indicates the
     * program should not continue (help, version, or an argument error), 1 if a
     * sampling routine throws an Exception, and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* Merge coverage reports when built with DMD in code coverage mode. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        auto argsOutcome = cmdopt.processArgs(cmdArgs);
        immutable bool shouldRun = argsOutcome[0];
        immutable int argsExitStatus = argsOutcome[1];
        if (!shouldRun) return argsExitStatus;

        /* Reset LDC profile data so only the sampling work is profiled. */
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        try
        {
            import tsvutil : BufferedOutputRange;
            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

            /* Pick the sampling routine. Rate-based modes are checked first; the
             * reservoir variants are distinguished by whether a sample size was given.
             */
            if (cmdopt.useStreamSampling) streamSampling(cmdopt, bufferedOutput);
            else if (cmdopt.useDistinctSampling) distinctSampling(cmdopt, bufferedOutput);
            else if (cmdopt.sampleSize == 0) reservoirSampling!(Yes.permuteAll)(cmdopt, bufferedOutput);
            else reservoirSampling!(No.permuteAll)(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}
69 
/* Brief help text, printed ahead of the option list when help is requested. */
auto helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Samples or randomizes input lines. There are several modes of operation:
* Randomization (Default): Input lines are output in random order.
* Stream sampling (--r|rate): Input lines are sampled based on a sampling
  rate. The order of the input is unchanged.
* Distinct sampling (--k|key-fields, --r|rate): Sampling is based on the
  values in the key field. A portion of the keys are chosen based on the
  sampling rate (a distinct set). All lines with one of the selected keys
  are output. Input order is unchanged.
* Weighted sampling (--w|weight-field): Input lines are selected using
  weighted random sampling, with the weight taken from a field. Input
  lines are output in the order selected, reordering the lines.

The '--n|num' option limits the sample size produced. It speeds up the
randomization and weighted sampling cases significantly.

Use '--help-verbose' for detailed information.

Options:
EOS";
92 
/* Detailed help text, printed ahead of the option list for '--help-verbose'. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Samples or randomizes input lines. There are several modes of operation:
* Randomization (Default): Input lines are output in random order.
* Stream sampling (--r|rate): Input lines are sampled based on a sampling
  rate. The order of the input is unchanged.
* Distinct sampling (--k|key-fields, --r|rate): Sampling is based on the
  values in the key field. A portion of the keys are chosen based on the
  sampling rate (a distinct set). All lines with one of the selected keys
  are output. Input order is unchanged.
* Weighted sampling (--w|weight-field): Input lines are selected using
  weighted random sampling, with the weight taken from a field. Input
  lines are output in the order selected, reordering the lines. See
  'Weighted sampling' below for info on field weights.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up randomization and weighted sampling significantly (details below).

Controlling randomization: Each run produces a different randomization.
Using '--s|static-seed' changes this so multiple runs produce the same
randomization. This works by using the same random seed each run. The
random seed can be specified using '--v|seed-value'. This takes a
non-zero, 32-bit positive integer. (A zero value is a no-op and ignored.)

Generating random weights: The random weight assigned to each line can be
output using the '--p|print-random' option. This can be used with
'--rate 1' to assign a random weight to each line. The random weight is
prepended to the line as field one (separated by TAB or --d|delimiter char).
Weights are in the interval [0,1]. The open/closed aspects of the
interval (including/excluding 0.0 and 1.0) are subject to change and
should not be relied on.

Reservoir sampling: The randomization and weighted sampling cases are
implemented using reservoir sampling. This means all lines output must be
held in memory. Memory needed for large input streams can be reduced
significantly using a sample size. Both 'tsv-sample -n 1000' and
'tsv-sample | head -n 1000' produce the same results, but the former is
quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Efraimidis and Spirakis. Weights should be positive values
representing the relative weight of the entry in the collection. Counts
and similar can be used as weights, it is *not* necessary to normalize to
a [0,1] interval. Negative values are not meaningful and are given the
value zero. Input order is not retained, instead lines are output ordered
by the randomized weight that was assigned. This means that a smaller valid
sample can be produced by taking the first N lines of output. For more
info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Options:
EOS";
148 
/** Command line options for tsv-sample, together with settings derived from them.
 *
 * Fields tagged with an option name are filled in directly by getopt. Fields marked
 * "Derived" are computed by processArgs after the command line has been parsed.
 */
struct TsvSampleOptions
{
    string programName;
    string[] files;
    bool helpVerbose = false;         // --help-verbose
    double sampleRate = double.nan;   // --r|rate - Sampling rate
    size_t sampleSize = 0;            // --n|num - Size of the desired sample
    size_t weightField = 0;           // --w|weight-field - Field holding the weight
    size_t[] keyFields;               // --k|key-fields - Used with sampling rate
    bool hasHeader = false;           // --H|header
    bool printRandom = false;         // --p|print-random
    bool staticSeed = false;          // --s|static-seed
    uint seedValueOptionArg = 0;      // --v|seed-value
    char delim = '\t';                // --d|delimiter
    bool versionWanted = false;       // --V|version
    bool hasWeightField = false;      // Derived.
    bool useStreamSampling = false;   // Derived.
    bool useDistinctSampling = false; // Derived.
    uint seed = 0;                    // Derived from --static-seed, --seed-value

    /** Parses the command line, validates option combinations, and fills in the
     * derived fields.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first. On success the
     *             trailing arguments are copied to `files` ("-" when there are
     *             none) and cmdArgs is truncated to the program name.
     *
     * Returns: A tuple (bool, int). The first member is true when the caller should
     * go on to process input. When it is false the second member is the exit status:
     * 0 after printing help or version information, 1 after an argument error (the
     * error message is written to stderr here).
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsvutil : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                /* Case sensitivity is toggled around options whose short form has
                 * meaning in a specific case (H, v, V).
                 */
                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "r|rate",          "NUM  Sampling rate (0.0 < NUM <= 1.0). This sampling mode outputs a random fraction of lines, in the input order.", &sampleRate,
                "n|num",           "NUM  Number of lines to output. All lines are output if not provided or zero.", &sampleSize,
                "w|weight-field",  "NUM  Field containing weights. All lines get equal weight if not provided or zero.", &weightField,

                "k|key-fields",    "<field-list>  Fields to use as key for distinct sampling. Use with --r|rate.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "p|print-random",  "     Output the random values that were assigned.", &printRandom,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the initial random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "d|delimiter",     "CHR  Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            /* Informational requests: print and tell the caller to exit with status 0. */
            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            if (keyFields.length > 0 && sampleRate.isNaN)
            {
                throw new Exception("--r|rate is required when using --k|key-fields.");
            }

            /* Sample rate (--r|rate) is used for both stream sampling and distinct
             * sampling. A NaN rate means the option was not given.
             */
            if (!sampleRate.isNaN)
            {
                if (sampleRate <= 0.0 || sampleRate > 1.0)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Invalid --r|rate option: %g. Must satisfy 0.0 < rate <= 1.0.", sampleRate));
                }

                if (hasWeightField) throw new Exception("--w|weight-field and --r|rate cannot be used together.");

                /* Key fields select distinct sampling; otherwise the rate means stream sampling. */
                if (keyFields.length > 0) useDistinctSampling = true;
                else useStreamSampling = true;
            }

            /* Seed. An explicit non-zero --seed-value wins, then the fixed seed used
             * for --static-seed, and otherwise an unpredictable seed.
             */
            import std.random : unpredictableSeed;
            seed = (seedValueOptionArg != 0) ? seedValueOptionArg
                : staticSeed ? 2438424139
                : unpredictableSeed;

            /* Assume remaining args are files. Use standard input if files were not provided. */
            files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"];
            cmdArgs.length = 1;
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
272 
/* streamSampling performs simple Bernoulli sampling over the input. A random value is
 * drawn for every data line; the line is written when that value is less than the
 * sampling rate. Lines are written in input order. With --header, only the first
 * header line seen is written; the first line of every later file is dropped. If a
 * sample size was given, processing stops as soon as that many data lines have been
 * written.
 *
 * Note: Performance tests show that skip sampling is faster when the sampling rate
 * is approximately 4-5% or less. A possible optimization is a separate function for
 * small sampling rates when random weights are not being added to each line.
 */
void streamSampling(OutputRange)(TsvSampleOptions cmdopt, OutputRange outputStream)
    if (isOutputRange!(OutputRange, char))
{
    import std.random : Random, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    auto rng = Random(cmdopt.seed);
    bool headerEmitted = false;
    size_t linesEmitted = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Header line: written once, for the first file only. */
                    if (!headerEmitted)
                    {
                        if (cmdopt.printRandom)
                        {
                            outputStream.put("random_weight");
                            outputStream.put(cmdopt.delim);
                        }
                        outputStream.put(line);
                        outputStream.put("\n");
                        headerEmitted = true;
                    }
                    continue;
                }
            }

            /* Data line: one random draw per line, whether or not it is kept. */
            double lineScore = uniform01(rng);
            if (!(lineScore < cmdopt.sampleRate)) continue;

            if (cmdopt.printRandom)
            {
                import std.format;
                outputStream.put(format("%.15g", lineScore));
                outputStream.put(cmdopt.delim);
            }
            outputStream.put(line);
            outputStream.put("\n");

            /* Lines are only counted when a sample size limit is in effect. */
            if (cmdopt.sampleSize != 0)
            {
                linesEmitted += 1;
                if (linesEmitted == cmdopt.sampleSize) return;
            }
        }
    }
}
336 
/* distinctSampling samples a portion of the unique key values. Each line's key fields
 * are hashed (seeded with the random seed) and the hash is mapped into a number of
 * buckets derived from the sampling rate. Lines whose key lands in bucket zero are
 * written, in input order. With --header, only the first header line seen is written.
 * If a sample size was given, processing stops once that many data lines are written.
 */
void distinctSampling(OutputRange)(TsvSampleOptions cmdopt, OutputRange outputStream)
    if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.format : format;
    import std.math : lrint;
    import tsvutil : InputFieldReordering, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.sampleRate && cmdopt.sampleRate <= 1.0);

    /* The delimiter is hashed between fields of a multi-field key. */
    immutable ubyte[1] delimArray = [cmdopt.delim];

    /* The bucket count is the inverse of the sampling rate, rounded to an integer. */
    uint numBuckets = to!uint(lrint(1.0 / cmdopt.sampleRate));

    /* Collects the key fields of each line, in the order given on the command line. */
    auto keyFieldsReordering = new InputFieldReordering!char(cmdopt.keyFields);

    bool headerEmitted = false;
    size_t linesEmitted = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Header line: written once, for the first file only. */
                    if (!headerEmitted)
                    {
                        outputStream.put(line);
                        outputStream.put("\n");
                        headerEmitted = true;
                    }
                    continue;
                }
            }

            /* Walk the fields until every key field has been captured. */
            keyFieldsReordering.initNewLine;
            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                keyFieldsReordering.processNextField(fieldIndex, fieldValue);
                if (keyFieldsReordering.allFieldsFilled) break;
            }

            if (!keyFieldsReordering.allFieldsFilled)
            {
                throw new Exception(
                    format("Not enough fields in line. File: %s, Line: %s",
                           (filename == "-") ? "Standard Input" : filename, fileLineNum));
            }

            /* Hash the assembled key. */
            auto hasher = MurmurHash3!32(cmdopt.seed);
            foreach (count, key; keyFieldsReordering.outputFields.enumerate)
            {
                if (count > 0) hasher.put(delimArray);
                hasher.put(cast(ubyte[]) key);
            }
            hasher.finish;

            /* Only keys hashing into bucket zero are part of the sample. */
            if (hasher.get % numBuckets != 0) continue;

            outputStream.put(line);
            outputStream.put("\n");

            /* Lines are only counted when a sample size limit is in effect. */
            if (cmdopt.sampleSize != 0)
            {
                linesEmitted += 1;
                if (linesEmitted == cmdopt.sampleSize) return;
            }
        }
    }
}
418 
/* An implementation of reservoir sampling supporting both weighted and unweighted
 * sampling. Both use the one-pass algorithm described by Efraimidis and Spirakis
 * ("Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis,
 * https://arxiv.org/abs/1012.0256). In the unweighted case every weight is one.
 *
 * The permuteAll flag selects between two variants, both built on a heap (priority
 * queue):
 * - Permuting all lines (sample size zero): A "max" heap is used, as it leaves the
 *   heap in the correct order for output. The backing store grows as lines are read.
 * - Sampling (non-zero sample size): A "min" heap is used. Here the heap's role is to
 *   identify the top-k elements, so adding a new item means dropping the "min" item.
 *   After all lines are read the "min" heap is in the opposite order from the one
 *   needed for output. The desired order is obtained by removing the elements from
 *   the heap one at a time, which leaves the underlying data store in the correct
 *   order. The backing store can be pre-allocated in this variant.
 */
void reservoirSampling(Flag!"permuteAll" permuteAll, OutputRange)
    (TsvSampleOptions cmdopt, OutputRange outputStream)
    if (isOutputRange!(OutputRange, char))
{
    import std.container.binaryheap;
    import std.random : Random, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    /* Ensure the template instantiation matches the requested sample size. */
    static if (permuteAll) assert(cmdopt.sampleSize == 0);
    else assert(cmdopt.sampleSize > 0);

    auto rng = Random(cmdopt.seed);

    /* A line paired with the random score assigned to it. */
    struct Entry
    {
        double score;
        char[] line;
    }

    /* Backing data store for the heap. The choice of container works around
     * limitations in the standard library implementation:
     * - Built-in arrays appear to have better memory behavior when appending than
     *   std.container.array Arrays, but cannot be used with binaryheaps until
     *   Phobos version 2.072.
     * - std.container.array Arrays with pre-allocated storage can be used to
     *   efficiently reverse the heap, but a bug prevents this from working for other
     *   data store use cases. Info: https://issues.dlang.org/show_bug.cgi?id=17094
     * - Result: A built-in array is used when permuting all lines on Phobos 2.072 or
     *   later. Otherwise a std.container.array Array is used.
     */
    static if (permuteAll && __VERSION__ >= 2072)
    {
        Entry[] dataStore;
    }
    else
    {
        import std.container.array;
        Array!Entry dataStore;
    }

    dataStore.reserve(cmdopt.sampleSize);

    /* Max binaryheap when permuting all lines, min binaryheap when sampling. */
    static if (permuteAll) enum heapOrdering = "a.score < b.score";
    else enum heapOrdering = "a.score > b.score";

    auto reservoir = dataStore.heapify!(heapOrdering)(0);

    /* Writes one entry, prefixed by its score when --print-random is in effect. */
    void printEntry(Entry entry)
    {
        if (cmdopt.printRandom)
        {
            import std.format;
            outputStream.put(format("%.15g", entry.score));
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.line);
        outputStream.put("\n");
    }

    /* Read every line, assigning scores and filling the reservoir. */
    bool headerEmitted = false;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Header line: written immediately, for the first file only. */
                    if (!headerEmitted)
                    {
                        if (cmdopt.printRandom)
                        {
                            outputStream.put("random_weight");
                            outputStream.put(cmdopt.delim);
                        }
                        outputStream.put(line);
                        outputStream.put("\n");
                        headerEmitted = true;
                    }
                    continue;
                }
            }

            /* Data line. The weight is one unless a weight field was specified. */
            double lineWeight = 1.0;
            if (cmdopt.hasWeightField)
            {
                lineWeight = getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);
            }

            /* Lines without a positive weight get a score of zero and consume no
             * random number.
             */
            double lineScore = 0.0;
            if (lineWeight > 0.0) lineScore = uniform01(rng) ^^ (1.0 / lineWeight);

            /* The line buffer is reused by byLine, so stored lines are copied. */
            static if (permuteAll)
            {
                reservoir.insert(Entry(lineScore, line.dup));
            }
            else
            {
                if (reservoir.length < cmdopt.sampleSize)
                {
                    reservoir.insert(Entry(lineScore, line.dup));
                }
                else if (reservoir.front.score < lineScore)
                {
                    /* Reservoir is full: the new line displaces the current minimum. */
                    reservoir.replaceFront(Entry(lineScore, line.dup));
                }
            }
        }
    }

    /* All entries are in the reservoir. Time to print. Entries are printed ordered
     * by assigned score. In the sampling/top-k case this could be sped up a little
     * by simply printing the backing store as-is. However, there is real value in
     * having a weighted order. This is especially true for weighted sampling, but
     * there is also value in the unweighted case, especially when using static seeds.
     */
    static if (permuteAll)
    {
        /* Walk the max-heap. */
        foreach (entry; reservoir) printEntry(entry);
    }
    else
    {
        /* Sampling/top-n case: Reorder the data store by extracting all the elements.
         * Note: Asserts are chosen to avoid issues in the current binaryheap implementation.
         */
        size_t numLines = reservoir.length;
        assert(numLines == dataStore.length);

        while (!reservoir.empty)
        {
            reservoir.removeFront;
        }
        assert(numLines == dataStore.length);

        foreach (entry; dataStore) printEntry(entry);
    }
}
576 
/* Extracts a single field from a line and converts it to type T. This is a thin wrapper
 * over getTsvFieldValue (see common/src/tsvutils.d for details) that rewrites failures
 * into error text tailored for this program, naming the file and line number.
 *
 * Throws: Exception if the field cannot be converted to T or if getTsvFieldValue fails
 * for another reason, such as the line not having enough fields.
 */
import std.traits : isSomeChar;
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum)
    pure @safe
    if (isSomeChar!C)
{
    import std.conv : ConvException;
    import std.format : format;
    import tsvutil : getTsvFieldValue;

    /* Name used for the input source in error messages. */
    string sourceName = (filename == "-") ? "Standard Input" : filename;

    try
    {
        return getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* Conversion failure. On the first line a likely cause is an unskipped header. */
        string headerHint = (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : "";
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s%s",
                   exc.msg, sourceName, lineNum, headerHint));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s",
                   exc.msg, sourceName, lineNum));
    }
}
611 
unittest
{
    /* Basic sanity checks on the getFieldValue wrapper. getTsvFieldValue has its
     * own tests, so only the wrapper's pass-through and error paths are covered.
     */
    import std.exception : assertThrown;

    /* Successful conversions. */
    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);

    /* Conversion failures, on line one and on a later line. */
    foreach (size_t lineNum; 1 .. 3)
    {
        assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", lineNum));
    }

    /* Field index past the end of the line, on line one and on a later line. */
    foreach (size_t lineNum; 1 .. 3)
    {
        assertThrown(getFieldValue!double("123", 1, '\t', "unittest", lineNum));
    }
}
626 
627 /* Unit tests for the main program start here.
628  *
629  * Portability note: Many of the tests here rely on generating consistent random numbers
630  * across different platforms when using the same random seed. So far this has succeeded
631  * on several different platform, compiler, and library versions. However, it is certainly
632  * possible this condition will not hold on other platforms.
633  *
634  * For tsv-sample, this portability implies generating the same results on different
635  * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
636  * but it is convenient for testing. If platforms are identified that do not generate
637  * the same results these tests will need to be adjusted.
638  */
version(unittest)
{
    /* Unit test helper functions. */

    import unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /* Runs tsv-sample in-process and compares its output against expected results.
     *
     * Plays the role of main: the command line in cmdArgs is processed, the sampling
     * routine selected by the options is run with output captured in memory, and the
     * captured text is asserted equal to the string form of the expected rows.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Builds an assert message prefixed with the helper and program names. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            return format("[testTsvSample] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        /* Capture the arguments as text first; processArgs modifies cmdArgs. */
        auto savedCmdArgs = cmdArgs.to!string;

        TsvSampleOptions cmdopt;
        auto argsOutcome = cmdopt.processArgs(cmdArgs);
        assert(argsOutcome[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        auto output = appender!(char[])();

        if (cmdopt.useDistinctSampling) distinctSampling(cmdopt, output);
        else if (cmdopt.useStreamSampling) streamSampling(cmdopt, output);
        else if (cmdopt.sampleSize == 0) reservoirSampling!(Yes.permuteAll)(cmdopt, output);
        else reservoirSampling!(No.permuteAll)(cmdopt, output);

        auto expectedOutput = expected.tsvDataToString;

        assert(output.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, output.data.to!string));
    }
}
690 
691 unittest
692 {
693     import std.path : buildPath;
694     import std.file : rmdirRecurse;
695     import std.format : format;
696 
697     auto testDir = makeUnittestTempDir("tsv_sample");
698     scope(exit) testDir.rmdirRecurse;
699 
700     /* Tabular data sets and expected results use the built-in static seed.
701      * Tests are run by writing the data set to a file, then calling the main
702      * routine to process. The function testTsvSample plays the role of the
703      * main program. Rather than writing to expected output, the results are
704      * matched against expected. The expected results were verified by hand
705      * prior to inclusion in the test.
706      *
707      * The initial part of this section is simply setting up data files and
708      * expected results.
709      */
710 
711     /* Empty file. */
712     string[][] dataEmpty = [];
713     string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
714     writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
715 
716     /* 3x1, header only. */
717     string[][] data3x0 = [["field_a", "field_b", "field_c"]];
718     string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
719     writeUnittestTsvFile(fpath_data3x0, data3x0);
720 
721     /* 3x1 */
722     string[][] data3x1 =
723         [["field_a", "field_b", "field_c"],
724          ["tan", "タン", "8.5"]];
725 
726     string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
727     string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
728     writeUnittestTsvFile(fpath_data3x1, data3x1);
729     writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]);
730 
731     string[][] data3x2 =
732         [["field_a", "field_b", "field_c"],
733          ["brown", "褐色", "29.2"],
734          ["gray", "グレー", "6.2"]];
735 
736     /* 3x2 */
737     string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
738     string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
739     writeUnittestTsvFile(fpath_data3x2, data3x2);
740     writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]);
741 
742     string[][] data3x2ExpectedNoWt =
743         [["field_a", "field_b", "field_c"],
744          ["gray", "グレー", "6.2"],
745          ["brown", "褐色", "29.2"]];
746 
747     /* 3x3 */
748     string[][] data3x3 =
749         [["field_a", "field_b", "field_c"],
750          ["orange", "オレンジ", "2.5"],
751          ["pink", "ピンク", "1.1"],
752          ["purple", "紫の", "42"]];
753 
754     string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
755     string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
756     writeUnittestTsvFile(fpath_data3x3, data3x3);
757     writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]);
758 
759     string[][] data3x3ExpectedNoWt =
760         [["field_a", "field_b", "field_c"],
761          ["purple", "紫の", "42"],
762          ["pink", "ピンク", "1.1"],
763          ["orange", "オレンジ", "2.5"]];
764 
765     /* 3x6 */
766     string[][] data3x6 =
767         [["field_a", "field_b", "field_c"],
768          ["red", "赤", "23.8"],
769          ["green", "緑", "0.0072"],
770          ["white", "白", "1.65"],
771          ["yellow", "黄", "12"],
772          ["blue", "青", "12"],
773          ["black", "黒", "0.983"]];
774     string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
775     string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
776     writeUnittestTsvFile(fpath_data3x6, data3x6);
777     writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]);
778 
779     string[][] data3x6ExpectedNoWt =
780         [["field_a", "field_b", "field_c"],
781          ["yellow", "黄", "12"],
782          ["black", "黒", "0.983"],
783          ["blue", "青", "12"],
784          ["white", "白", "1.65"],
785          ["green", "緑", "0.0072"],
786          ["red", "赤", "23.8"]];
787 
788     string[][] data3x6ExpectedNoWtProbs =
789         [["random_weight", "field_a", "field_b", "field_c"],
790          ["0.960555462865159", "yellow", "黄", "12"],
791          ["0.757101539289579", "black", "黒", "0.983"],
792          ["0.525259808870032", "blue", "青", "12"],
793          ["0.492878549499437", "white", "白", "1.65"],
794          ["0.159293440869078", "green", "緑", "0.0072"],
795          ["0.010968807619065", "red", "赤", "23.8"]];
796 
797     string[][] data3x6ExpectedProbsStreamSampleP100 =
798         [["random_weight", "field_a", "field_b", "field_c"],
799          ["0.010968807619065", "red", "赤", "23.8"],
800          ["0.159293440869078", "green", "緑", "0.0072"],
801          ["0.492878549499437", "white", "白", "1.65"],
802          ["0.960555462865159", "yellow", "黄", "12"],
803          ["0.525259808870032", "blue", "青", "12"],
804          ["0.757101539289579", "black", "黒", "0.983"]];
805 
806     string[][] data3x6ExpectedProbsStreamSampleP60 =
807         [["random_weight", "field_a", "field_b", "field_c"],
808          ["0.010968807619065", "red", "赤", "23.8"],
809          ["0.159293440869078", "green", "緑", "0.0072"],
810          ["0.492878549499437", "white", "白", "1.65"],
811          ["0.525259808870032", "blue", "青", "12"]];
812 
813     string[][] data3x6ExpectedStreamSampleP60 =
814         [["field_a", "field_b", "field_c"],
815          ["red", "赤", "23.8"],
816          ["green", "緑", "0.0072"],
817          ["white", "白", "1.65"],
818          ["blue", "青", "12"]];
819 
820     string[][] data3x6ExpectedDistinctSampleK1K3P60 =
821         [["field_a", "field_b", "field_c"],
822          ["green", "緑", "0.0072"],
823          ["white", "白", "1.65"],
824          ["blue", "青", "12"]];
825 
826     string[][] data3x6ExpectedWt3Probs =
827         [["random_weight", "field_a", "field_b", "field_c"],
828          ["0.996651987576454", "yellow", "黄", "12"],
829          ["0.947758848098367", "blue", "青", "12"],
830          ["0.827282346822867", "red", "赤", "23.8"],
831          ["0.75346697377182", "black", "黒", "0.983"],
832          ["0.651301034964225", "white", "白", "1.65"],
833          ["1.56369437128799e-111", "green", "緑", "0.0072"]];
834 
835     string[][] data3x6ExpectedWt3 =
836         [["field_a", "field_b", "field_c"],
837          ["yellow", "黄", "12"],
838          ["blue", "青", "12"],
839          ["red", "赤", "23.8"],
840          ["black", "黒", "0.983"],
841          ["white", "白", "1.65"],
842          ["green", "緑", "0.0072"]];
843 
844     /* Using a different static seed. */
845     string[][] data3x6ExpectedNoWtV41Probs =
846         [["random_weight", "field_a", "field_b", "field_c"],
847          ["0.680572726530954", "green", "緑", "0.0072"],
848          ["0.676816243678331", "blue", "青", "12"],
849          ["0.32097338931635", "yellow", "黄", "12"],
850          ["0.250923618674278", "red", "赤", "23.8"],
851          ["0.155359342927113", "black", "黒", "0.983"],
852          ["0.0460958210751414", "white", "白", "1.65"]];
853 
854     string[][] data3x6ExpectedV41ProbsStreamSampleP60 =
855         [["random_weight", "field_a", "field_b", "field_c"],
856          ["0.250923618674278", "red", "赤", "23.8"],
857          ["0.0460958210751414", "white", "白", "1.65"],
858          ["0.32097338931635", "yellow", "黄", "12"],
859          ["0.155359342927113", "black", "黒", "0.983"]];
860 
861     string[][] data3x6ExpectedWt3V41Probs =
862         [["random_weight", "field_a", "field_b", "field_c"],
863          ["0.967993774989107", "blue", "青", "12"],
864          ["0.943562457925736", "red", "赤", "23.8"],
865          ["0.90964601024272", "yellow", "黄", "12"],
866          ["0.154916584092601", "white", "白", "1.65"],
867          ["0.15043620392537", "black", "黒", "0.983"],
868          ["6.13946748307015e-24", "green", "緑", "0.0072"]];
869 
870 
871     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
872     string[][] combo1ExpectedNoWt =
873         [["field_a", "field_b", "field_c"],
874          ["yellow", "黄", "12"],
875          ["tan", "タン", "8.5"],
876          ["brown", "褐色", "29.2"],
877          ["green", "緑", "0.0072"],
878          ["red", "赤", "23.8"],
879          ["purple", "紫の", "42"],
880          ["black", "黒", "0.983"],
881          ["white", "白", "1.65"],
882          ["gray", "グレー", "6.2"],
883          ["blue", "青", "12"],
884          ["pink", "ピンク", "1.1"],
885          ["orange", "オレンジ", "2.5"]];
886 
887     string[][] combo1ExpectedNoWtProbs =
888         [["random_weight", "field_a", "field_b", "field_c"],
889          ["0.970885202754289", "yellow", "黄", "12"],
890          ["0.960555462865159", "tan", "タン", "8.5"],
891          ["0.817568943137303", "brown", "褐色", "29.2"],
892          ["0.757101539289579", "green", "緑", "0.0072"],
893          ["0.525259808870032", "red", "赤", "23.8"],
894          ["0.492878549499437", "purple", "紫の", "42"],
895          ["0.470815070671961", "black", "黒", "0.983"],
896          ["0.383881829213351", "white", "白", "1.65"],
897          ["0.292159906122833", "gray", "グレー", "6.2"],
898          ["0.240332160145044", "blue", "青", "12"],
899          ["0.159293440869078", "pink", "ピンク", "1.1"],
900          ["0.010968807619065", "orange", "オレンジ", "2.5"]];
901 
902     string[][] combo1ExpectedProbsStreamSampleP50 =
903         [["random_weight", "field_a", "field_b", "field_c"],
904          ["0.010968807619065", "orange", "オレンジ", "2.5"],
905          ["0.159293440869078", "pink", "ピンク", "1.1"],
906          ["0.492878549499437", "purple", "紫の", "42"],
907          ["0.383881829213351", "white", "白", "1.65"],
908          ["0.240332160145044", "blue", "青", "12"],
909          ["0.470815070671961", "black", "黒", "0.983"],
910          ["0.292159906122833", "gray", "グレー", "6.2"]];
911 
912     string[][] combo1ExpectedStreamSampleP40 =
913         [["field_a", "field_b", "field_c"],
914          ["orange", "オレンジ", "2.5"],
915          ["pink", "ピンク", "1.1"],
916          ["white", "白", "1.65"],
917          ["blue", "青", "12"],
918          ["gray", "グレー", "6.2"]];
919 
920     string[][] combo1ExpectedDistinctSampleK1P40 =
921         [["field_a", "field_b", "field_c"],
922          ["orange", "オレンジ", "2.5"],
923          ["red", "赤", "23.8"],
924          ["green", "緑", "0.0072"],
925          ["blue", "青", "12"],
926          ["black", "黒", "0.983"]];
927 
928     string[][] combo1ExpectedWt3Probs =
929         [["random_weight", "field_a", "field_b", "field_c"],
930          ["0.997540775237188", "yellow", "黄", "12"],
931          ["0.995276654400888", "tan", "タン", "8.5"],
932          ["0.993125789457417", "brown", "褐色", "29.2"],
933          ["0.983296025533894", "purple", "紫の", "42"],
934          ["0.973309619380837", "red", "赤", "23.8"],
935          ["0.887975515217396", "blue", "青", "12"],
936          ["0.819992304890418", "gray", "グレー", "6.2"],
937          ["0.559755692042509", "white", "白", "1.65"],
938          ["0.464721356092057", "black", "黒", "0.983"],
939          ["0.188245827041913", "pink", "ピンク", "1.1"],
940          ["0.164461318532999", "orange", "オレンジ", "2.5"],
941          ["1.64380869310205e-17", "green", "緑", "0.0072"]];
942 
943     string[][] combo1ExpectedWt3 =
944         [["field_a", "field_b", "field_c"],
945          ["yellow", "黄", "12"],
946          ["tan", "タン", "8.5"],
947          ["brown", "褐色", "29.2"],
948          ["purple", "紫の", "42"],
949          ["red", "赤", "23.8"],
950          ["blue", "青", "12"],
951          ["gray", "グレー", "6.2"],
952          ["white", "白", "1.65"],
953          ["black", "黒", "0.983"],
954          ["pink", "ピンク", "1.1"],
955          ["orange", "オレンジ", "2.5"],
956          ["green", "緑", "0.0072"]];
957 
958     /* 1x10 - Simple 1-column file. */
959     string[][] data1x10 =
960         [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
961     string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
962     string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
963     writeUnittestTsvFile(fpath_data1x10, data1x10);
964     writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]);
965 
966     string[][] data1x10ExpectedNoWt =
967         [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
968 
969     string[][] data1x10ExpectedWt1 =
970         [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]];
971 
972     /* 2x10a - Uniform distribution [0,1]. */
973     string[][] data2x10a =
974         [["line", "weight"],
975          ["1", "0.26788837"],
976          ["2", "0.06601298"],
977          ["3", "0.38627527"],
978          ["4", "0.47379424"],
979          ["5", "0.02966641"],
980          ["6", "0.05636231"],
981          ["7", "0.70529242"],
982          ["8", "0.91836862"],
983          ["9", "0.99103720"],
984          ["10", "0.31401740"]];
985 
986     string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv");
987     writeUnittestTsvFile(fpath_data2x10a, data2x10a);
988 
989     string[][] data2x10aExpectedWt2Probs =
990         [["random_weight", "line", "weight"],
991          ["0.968338654945437", "8", "0.91836862"],
992          ["0.918568420544139", "4", "0.47379424"],
993          ["0.257308320877951", "7", "0.70529242"],
994          ["0.237253179070181", "9", "0.99103720"],
995          ["0.160160967018722", "3", "0.38627527"],
996          ["0.0908196626672434", "10", "0.31401740"],
997          ["0.00717645392443612", "6", "0.05636231"],
998          ["4.83186429516301e-08", "1", "0.26788837"],
999          ["3.75256929665355e-10", "5", "0.02966641"],
1000          ["8.21232478800958e-13", "2", "0.06601298"]];
1001 
1002     /* 2x10b - Uniform distribution [0,1000]. */
1003     string[][] data2x10b =
1004         [["line", "weight"],
1005          ["1", "761"],
1006          ["2", "432"],
1007          ["3", "103"],
1008          ["4", "448"],
1009          ["5", "750"],
1010          ["6", "711"],
1011          ["7", "867"],
1012          ["8", "841"],
1013          ["9", "963"],
1014          ["10", "784"]];
1015 
1016     string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv");
1017     writeUnittestTsvFile(fpath_data2x10b, data2x10b);
1018 
1019     string[][] data2x10bExpectedWt2Probs =
1020         [["random_weight", "line", "weight"],
1021          ["0.99996486739068", "8", "841"],
1022          ["0.999910174671372", "4", "448"],
1023          ["0.999608715248737", "6", "711"],
1024          ["0.999141885371438", "5", "750"],
1025          ["0.999039632502748", "10", "784"],
1026          ["0.998896318259319", "7", "867"],
1027          ["0.998520583151911", "9", "963"],
1028          ["0.995756696791589", "2", "432"],
1029          ["0.994087587320506", "1", "761"],
1030          ["0.993154677612124", "3", "103"]];
1031 
1032     /* 2x10c - Logarithmic distribution in random order. */
1033     string[][] data2x10c =
1034         [["line", "weight"],
1035          ["1", "31.85"],
1036          ["2", "17403.31"],
1037          ["3", "653.84"],
1038          ["4", "8.23"],
1039          ["5", "2671.04"],
1040          ["6", "26226.08"],
1041          ["7", "1.79"],
1042          ["8", "354.56"],
1043          ["9", "35213.81"],
1044          ["10", "679.29"]];
1045 
1046     string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv");
1047     writeUnittestTsvFile(fpath_data2x10c, data2x10c);
1048 
1049     string[][] data2x10cExpectedWt2Probs =
1050         [["random_weight", "line", "weight"],
1051          ["0.999989390087097", "6", "26226.08"],
1052          ["0.999959512916955", "9", "35213.81"],
1053          ["0.999916669076135", "8", "354.56"],
1054          ["0.999894450521864", "2", "17403.31"],
1055          ["0.999758976028616", "5", "2671.04"],
1056          ["0.998918527698776", "3", "653.84"],
1057          ["0.998891677527825", "10", "679.29"],
1058          ["0.995122075068501", "4", "8.23"],
1059          ["0.86789371584259", "1", "31.85"],
1060          ["0.585744381629156", "7", "1.79"]];
1061 
1062     /* 2x10d. Logarithmic distribution in ascending order. */
1063     string[][] data2x10d =
1064         [["line", "weight"],
1065          ["1", "1.79"],
1066          ["2", "8.23"],
1067          ["3", "31.85"],
1068          ["4", "354.56"],
1069          ["5", "653.84"],
1070          ["6", "679.29"],
1071          ["7", "2671.04"],
1072          ["8", "17403.31"],
1073          ["9", "26226.08"],
1074          ["10", "35213.81"]];
1075 
1076     string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv");
1077     writeUnittestTsvFile(fpath_data2x10d, data2x10d);
1078 
1079     string[][] data2x10dExpectedWt2Probs =
1080         [["random_weight", "line", "weight"],
1081          ["0.999998302218464", "8", "17403.31"],
1082          ["0.999978608340414", "10", "35213.81"],
1083          ["0.999945638289867", "9", "26226.08"],
1084          ["0.999886503635757", "4", "354.56"],
1085          ["0.999641619391901", "7", "2671.04"],
1086          ["0.999590453389486", "6", "679.29"],
1087          ["0.999015744906398", "5", "653.84"],
1088          ["0.978031633047474", "3", "31.85"],
1089          ["0.799947918069109", "2", "8.23"],
1090          ["0.0803742612399491", "1", "1.79"]];
1091 
1092     /* 2x10e. Logarithmic distribution in descending order. */
1093     string[][] data2x10e =
1094         [["line", "weight"],
1095          ["1", "35213.81"],
1096          ["2", "26226.08"],
1097          ["3", "17403.31"],
1098          ["4", "2671.04"],
1099          ["5", "679.29"],
1100          ["6", "653.84"],
1101          ["7", "354.56"],
1102          ["8", "31.85"],
1103          ["9", "8.23"],
1104          ["10", "1.79"]];
1105     string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
1106     writeUnittestTsvFile(fpath_data2x10e, data2x10e);
1107 
1108     string[][] data2x10eExpectedWt2Probs =
1109         [["random_weight", "line", "weight"],
1110          ["0.999984933489752", "4", "2671.04"],
1111          ["0.999959348072026", "3", "17403.31"],
1112          ["0.999929957397275", "2", "26226.08"],
1113          ["0.999871856792456", "1", "35213.81"],
1114          ["0.999574515631739", "6", "653.84"],
1115          ["0.999072736502096", "8", "31.85"],
1116          ["0.999052603129689", "5", "679.29"],
1117          ["0.997303336505164", "7", "354.56"],
1118          ["0.840939024352278", "9", "8.23"],
1119          ["0.6565001592629", "10", "1.79"]];
1120 
1121     /* Data sets for distinct sampling. */
1122     string[][] data5x25 =
1123         [["ID", "Shape", "Color", "Size", "Weight"],
1124          ["01", "circle", "red", "S", "10"],
1125          ["02", "circle", "black", "L", "20"],
1126          ["03", "square", "black", "L", "20"],
1127          ["04", "circle", "green", "L", "30"],
1128          ["05", "ellipse", "red", "S", "20"],
1129          ["06", "triangle", "red", "S", "10"],
1130          ["07", "triangle", "red", "L", "20"],
1131          ["08", "square", "black", "S", "10"],
1132          ["09", "circle", "black", "S", "20"],
1133          ["10", "square", "green", "L", "20"],
1134          ["11", "triangle", "red", "L", "20"],
1135          ["12", "circle", "green", "L", "30"],
1136          ["13", "ellipse", "red", "S", "20"],
1137          ["14", "circle", "green", "L", "30"],
1138          ["15", "ellipse", "red", "L", "30"],
1139          ["16", "square", "red", "S", "10"],
1140          ["17", "circle", "black", "L", "20"],
1141          ["18", "square", "red", "S", "20"],
1142          ["19", "square", "black", "L", "20"],
1143          ["20", "circle", "red", "S", "10"],
1144          ["21", "ellipse", "black", "L", "30"],
1145          ["22", "triangle", "red", "L", "30"],
1146          ["23", "circle", "green", "S", "20"],
1147          ["24", "square", "green", "L", "20"],
1148          ["25", "circle", "red", "S", "10"],
1149             ];
1150 
1151     string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
1152     string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
1153     writeUnittestTsvFile(fpath_data5x25, data5x25);
1154     writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]);
1155 
1156     string[][] data5x25ExpectedDistinctSampleK2P40 =
1157         [["ID", "Shape", "Color", "Size", "Weight"],
1158          ["03", "square", "black", "L", "20"],
1159          ["05", "ellipse", "red", "S", "20"],
1160          ["08", "square", "black", "S", "10"],
1161          ["10", "square", "green", "L", "20"],
1162          ["13", "ellipse", "red", "S", "20"],
1163          ["15", "ellipse", "red", "L", "30"],
1164          ["16", "square", "red", "S", "10"],
1165          ["18", "square", "red", "S", "20"],
1166          ["19", "square", "black", "L", "20"],
1167          ["21", "ellipse", "black", "L", "30"],
1168          ["24", "square", "green", "L", "20"],
1169             ];
1170 
1171     string[][] data5x25ExpectedDistinctSampleK2K4P20 =
1172         [["ID", "Shape", "Color", "Size", "Weight"],
1173          ["03", "square", "black", "L", "20"],
1174          ["07", "triangle", "red", "L", "20"],
1175          ["08", "square", "black", "S", "10"],
1176          ["10", "square", "green", "L", "20"],
1177          ["11", "triangle", "red", "L", "20"],
1178          ["16", "square", "red", "S", "10"],
1179          ["18", "square", "red", "S", "20"],
1180          ["19", "square", "black", "L", "20"],
1181          ["22", "triangle", "red", "L", "30"],
1182          ["24", "square", "green", "L", "20"],
1183             ];
1184 
1185     string[][] data5x25ExpectedDistinctSampleK2K3K4P20 =
1186         [["ID", "Shape", "Color", "Size", "Weight"],
1187          ["04", "circle", "green", "L", "30"],
1188          ["07", "triangle", "red", "L", "20"],
1189          ["09", "circle", "black", "S", "20"],
1190          ["11", "triangle", "red", "L", "20"],
1191          ["12", "circle", "green", "L", "30"],
1192          ["14", "circle", "green", "L", "30"],
1193          ["16", "square", "red", "S", "10"],
1194          ["18", "square", "red", "S", "20"],
1195          ["22", "triangle", "red", "L", "30"],
1196             ];
1197 
1198     /*
1199      * Enough setup! Actually run some tests!
1200      */
1201 
1202     /* Basic tests. Headers and static seed. With weights and without. */
1203     testTsvSample(["test-a1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
1204     testTsvSample(["test-a2", "--header", "--static-seed", fpath_data3x0], data3x0);
1205     testTsvSample(["test-a3", "-H", "-s", fpath_data3x1], data3x1);
1206     testTsvSample(["test-a4", "-H", "-s", fpath_data3x2], data3x2ExpectedNoWt);
1207     testTsvSample(["test-a5", "-H", "-s", fpath_data3x3], data3x3ExpectedNoWt);
1208     testTsvSample(["test-a6", "-H", "-s", fpath_data3x6], data3x6ExpectedNoWt);
1209     testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedNoWtProbs);
1210     testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedWt3);
1211     testTsvSample(["test-a9", "-H", "-s", "-p", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Probs);
1212     testTsvSample(["test-a10", "-H", "--seed-value", "41", "-p", fpath_data3x6], data3x6ExpectedNoWtV41Probs);
1213     testTsvSample(["test-a11", "-H", "-s", "-v", "41", "-p", fpath_data3x6], data3x6ExpectedNoWtV41Probs);
1214     testTsvSample(["test-a12", "-H", "-s", "-v", "0", "-p", fpath_data3x6], data3x6ExpectedNoWtProbs);
1215     testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "-p", fpath_data3x6], data3x6ExpectedWt3V41Probs);
1216 
1217     /* Stream sampling cases. */
1218     testTsvSample(["test-a14", "--header", "--static-seed", "--rate", "0.001", fpath_dataEmpty], dataEmpty);
1219     testTsvSample(["test-a15", "--header", "--static-seed", "--rate", "0.001", fpath_data3x0], data3x0);
1220     testTsvSample(["test-a16", "-H", "-s", "-r", "1.0", fpath_data3x1], data3x1);
1221     testTsvSample(["test-a17", "-H", "-s", "-r", "1.0", fpath_data3x6], data3x6);
1222     testTsvSample(["test-a18", "-H", "-r", "1.0", fpath_data3x6], data3x6);
1223     testTsvSample(["test-a19", "-H", "-s", "--rate", "1.0", "-p", fpath_data3x6], data3x6ExpectedProbsStreamSampleP100);
1224     testTsvSample(["test-a20", "-H", "-s", "--rate", "0.60", "-p", fpath_data3x6], data3x6ExpectedProbsStreamSampleP60);
1225     testTsvSample(["test-a21", "-H", "-s", "--rate", "0.60", fpath_data3x6], data3x6ExpectedStreamSampleP60);
1226     testTsvSample(["test-a22", "-H", "-v", "41", "--rate", "0.60", "-p", fpath_data3x6], data3x6ExpectedV41ProbsStreamSampleP60);
1227 
1228     /* Distinct sampling cases. */
1229     testTsvSample(["test-a23", "--header", "--static-seed", "--rate", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
1230     testTsvSample(["test-a24", "--header", "--static-seed", "--rate", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
1231     testTsvSample(["test-a25", "-H", "-s", "-r", "1.0", "-k", "2", fpath_data3x1], data3x1);
1232     testTsvSample(["test-a26", "-H", "-s", "-r", "1.0", "-k", "2", fpath_data3x6], data3x6);
1233 
1234     /* Basic tests, without headers. */
1235     testTsvSample(["test-b1", "-s", fpath_data3x1_noheader], data3x1[1..$]);
1236     testTsvSample(["test-b2", "-s", fpath_data3x2_noheader], data3x2ExpectedNoWt[1..$]);
1237     testTsvSample(["test-b3", "-s", fpath_data3x3_noheader], data3x3ExpectedNoWt[1..$]);
1238     testTsvSample(["test-b4", "-s", fpath_data3x6_noheader], data3x6ExpectedNoWt[1..$]);
1239     testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedNoWtProbs[1..$]);
1240     testTsvSample(["test-b6", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3[1..$]);
1241     testTsvSample(["test-b7", "-s", "-p", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Probs[1..$]);
1242     testTsvSample(["test-b8", "-v", "41", "-p", fpath_data3x6_noheader], data3x6ExpectedNoWtV41Probs[1..$]);
1243     testTsvSample(["test-b9", "-v", "41", "-w", "3", "-p", fpath_data3x6_noheader], data3x6ExpectedWt3V41Probs[1..$]);
1244 
1245     /* Stream sampling cases. */
1246     testTsvSample(["test-b10", "-s", "-r", "1.0", fpath_data3x1_noheader], data3x1[1..$]);
1247     testTsvSample(["test-b11", "-s", "-r", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
1248     testTsvSample(["test-b12", "-r", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
1249     testTsvSample(["test-b13", "-s", "--rate", "1.0", "-p", fpath_data3x6_noheader], data3x6ExpectedProbsStreamSampleP100[1..$]);
1250     testTsvSample(["test-b14", "-s", "--rate", "0.60", "-p", fpath_data3x6_noheader], data3x6ExpectedProbsStreamSampleP60[1..$]);
1251     testTsvSample(["test-b15", "-v", "41", "--rate", "0.60", "-p", fpath_data3x6_noheader], data3x6ExpectedV41ProbsStreamSampleP60[1..$]);
1252 
1253     /* Distinct sampling cases. */
1254     testTsvSample(["test-a25", "-s", "-r", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]);
1255     testTsvSample(["test-a26", "-s", "-r", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
1256     testTsvSample(["test-a27", "-r", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
1257     testTsvSample(["test-a28", "-v", "71563", "-r", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
1258 
1259     /* Multi-file tests. */
1260     testTsvSample(["test-c1", "--header", "--static-seed",
1261                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1262                   combo1ExpectedNoWt);
1263     testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
1264                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1265                   combo1ExpectedNoWtProbs);
1266     testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
1267                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1268                   combo1ExpectedWt3Probs);
1269     testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3",
1270                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1271                   combo1ExpectedWt3);
1272 
1273     /* Multi-file, no headers. */
1274     testTsvSample(["test-c5", "--static-seed",
1275                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1276                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1277                   combo1ExpectedNoWt[1..$]);
1278     testTsvSample(["test-c6", "--static-seed", "--print-random",
1279                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1280                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1281                   combo1ExpectedNoWtProbs[1..$]);
1282     testTsvSample(["test-c7", "--static-seed", "--print-random", "--weight-field", "3",
1283                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1284                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1285                   combo1ExpectedWt3Probs[1..$]);
1286     testTsvSample(["test-c8", "--static-seed", "--weight-field", "3",
1287                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1288                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1289                   combo1ExpectedWt3[1..$]);
1290 
1291     /* Stream sampling cases. */
1292     testTsvSample(["test-c9", "--header", "--static-seed", "--print-random", "--rate", ".5",
1293                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1294                   combo1ExpectedProbsStreamSampleP50);
1295     testTsvSample(["test-c10", "--header", "--static-seed", "--rate", ".4",
1296                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1297                   combo1ExpectedStreamSampleP40);
1298     testTsvSample(["test-c11", "--static-seed", "--print-random", "--rate", ".5",
1299                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1300                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1301                   combo1ExpectedProbsStreamSampleP50[1..$]);
1302     testTsvSample(["test-c12", "--static-seed", "--rate", ".4",
1303                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1304                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1305                   combo1ExpectedStreamSampleP40[1..$]);
1306 
1307     /* Distinct sampling cases. */
1308     testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--rate", ".4",
1309                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1310                   combo1ExpectedDistinctSampleK1P40);
1311     testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--rate", ".4",
1312                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1313                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1314                   combo1ExpectedDistinctSampleK1P40[1..$]);
1315 
1316     /* Single column file. */
1317     testTsvSample(["test-d1", "-H", "-s", fpath_data1x10], data1x10ExpectedNoWt);
1318     testTsvSample(["test-d1", "-H", "-s", fpath_data1x10], data1x10ExpectedNoWt);
1319 
1320     /* Distributions. */
1321     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "-p", fpath_data2x10a], data2x10aExpectedWt2Probs);
1322     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "-p", fpath_data2x10b], data2x10bExpectedWt2Probs);
1323     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "-p", fpath_data2x10c], data2x10cExpectedWt2Probs);
1324     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "-p", fpath_data2x10d], data2x10dExpectedWt2Probs);
1325     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "-p", fpath_data2x10e], data2x10eExpectedWt2Probs);
1326 
1327     /* Tests of the subset sample size option (-n|--num).
1328      *
1329      * Note: The way these tests are done ensures that subset length does not affect
1330      * output order.
1331      */
1332     import std.algorithm : min;
1333     for (size_t n = data3x6.length + 2; n >= 1; n--)
1334     {
1335         size_t expectedLength = min(data3x6.length, n + 1);
1336         testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
1337                        "-H", fpath_data3x6], data3x6ExpectedNoWt[0..expectedLength]);
1338 
1339         testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
1340                        "-H", "-p", fpath_data3x6], data3x6ExpectedNoWtProbs[0..expectedLength]);
1341 
1342         testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
1343                        "-H", "-w", "3", fpath_data3x6], data3x6ExpectedWt3[0..expectedLength]);
1344 
1345         testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
1346                        "-H", "-p", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Probs[0..expectedLength]);
1347 
1348         testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
1349                        fpath_data3x6_noheader], data3x6ExpectedNoWt[1..expectedLength]);
1350 
1351         testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
1352                        "-p", fpath_data3x6_noheader], data3x6ExpectedNoWtProbs[1..expectedLength]);
1353 
1354         testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
1355                        "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3[1..expectedLength]);
1356 
1357         testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
1358                        "-p", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Probs[1..expectedLength]);
1359 
1360         import std.algorithm : min;
1361         size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedProbsStreamSampleP60.length);
1362 
1363         testTsvSample([format("test-f9_%d", n), "-s", "-r", "0.6", "-n", n.to!string,
1364                        "-H", "-p", fpath_data3x6], data3x6ExpectedProbsStreamSampleP60[0..sampleExpectedLength]);
1365 
1366         testTsvSample([format("test-f10_%d", n), "-s", "-r", "0.6", "-n", n.to!string,
1367                        "-H", fpath_data3x6], data3x6ExpectedStreamSampleP60[0..sampleExpectedLength]);
1368 
1369         testTsvSample([format("test-f11_%d", n), "-s", "-r", "0.6", "-n", n.to!string,
1370                        "-p", fpath_data3x6_noheader], data3x6ExpectedProbsStreamSampleP60[1..sampleExpectedLength]);
1371 
1372         testTsvSample([format("test-f12_%d", n), "-s", "-r", "0.6", "-n", n.to!string,
1373                        fpath_data3x6_noheader], data3x6ExpectedStreamSampleP60[1..sampleExpectedLength]);
1374 
1375         size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctSampleK1K3P60.length);
1376 
1377         testTsvSample([format("test-f13_%d", n), "-s", "-k", "1,3", "-r", "0.6", "-n", n.to!string,
1378                        "-H", fpath_data3x6], data3x6ExpectedDistinctSampleK1K3P60[0..distinctExpectedLength]);
1379 
1380         testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-r", "0.6", "-n", n.to!string,
1381                        fpath_data3x6_noheader], data3x6ExpectedDistinctSampleK1K3P60[1..distinctExpectedLength]);
1382     }
1383 
1384     /* Similar tests with the 1x10 data set. */
1385     for (size_t n = data1x10.length + 2; n >= 1; n--)
1386     {
1387         size_t expectedLength = min(data1x10.length, n + 1);
1388         testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
1389                        "-H", fpath_data1x10], data1x10ExpectedNoWt[0..expectedLength]);
1390 
1391         testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
1392                        "-H", "-w", "1", fpath_data1x10], data1x10ExpectedWt1[0..expectedLength]);
1393 
1394         testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
1395                        fpath_data1x10_noheader], data1x10ExpectedNoWt[1..expectedLength]);
1396 
1397         testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
1398                        "-w", "1", fpath_data1x10_noheader], data1x10ExpectedWt1[1..expectedLength]);
1399     }
1400 
1401     /* Distinct sampling tests. */
1402     testTsvSample(["h1", "--header", "--static-seed", "--rate", "0.40", "--key-fields", "2", fpath_data5x25],
1403                   data5x25ExpectedDistinctSampleK2P40);
1404 
1405     testTsvSample(["h2", "-H", "-s", "-r", "0.20", "-k", "2,4", fpath_data5x25],
1406                   data5x25ExpectedDistinctSampleK2K4P20);
1407 
1408     testTsvSample(["h3", "-H", "-s", "-r", "0.20", "-k", "2-4", fpath_data5x25],
1409                   data5x25ExpectedDistinctSampleK2K3K4P20);
1410 
1411     testTsvSample(["h4", "--static-seed", "--rate", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
1412                   data5x25ExpectedDistinctSampleK2P40[1..$]);
1413 
1414     testTsvSample(["h5", "-s", "-r", "0.20", "-k", "2,4", fpath_data5x25_noheader],
1415                   data5x25ExpectedDistinctSampleK2K4P20[1..$]);
1416 
1417     testTsvSample(["h6", "-s", "-r", "0.20", "-k", "2-4", fpath_data5x25_noheader],
1418                   data5x25ExpectedDistinctSampleK2K3K4P20[1..$]);
1419 }