1 /**
2 Command line tool for randomizing or sampling lines from input streams. Several
3 sampling methods are available, including simple random sampling, weighted random
4 sampling, Bernoulli sampling, and distinct sampling.
5 
6 Copyright (c) 2017-2018, eBay Software Foundation
7 Initially written by Jon Degenhardt
8 
9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 module tsv_sample;
12 
13 import std.range;
14 import std.stdio;
15 import std.typecons : tuple, Flag;
16 
version(unittest)
{
    // Unit test builds get their entry point from the compiler's -main switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line arguments, then runs tsvSample writing to
     * standard output through a BufferedOutputRange.
     *
     * Returns: The exit code supplied by processArgs when it indicates that
     * processing should not continue, 1 if an Exception is caught while sampling,
     * and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* DMD code coverage builds: turn on merging of coverage reports. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        auto argsResult = cmdopt.processArgs(cmdArgs);

        /* argsResult[0]: continue processing? argsResult[1]: exit code if not. */
        immutable bool continueProcessing = argsResult[0];
        if (!continueProcessing) return argsResult[1];

        /* LDC profiling builds: reset profile data so option processing is excluded. */
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        int exitCode = 0;
        try
        {
            import tsvutil : BufferedOutputRange;

            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            exitCode = 1;
        }
        return exitCode;
    }
}
55 
/** Brief help text. Printed ahead of the option list when help is requested
 * without '--help-verbose'.
 */
auto helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

The '--n|num' option limits the sample size produced. It speeds up line
order randomization and weighted sampling significantly. It is also used
to terminate sampling with replacement.

Use '--help-verbose' for detailed information.

Options:
EOS";
86 
/** Detailed help text. Printed ahead of the option list when '--help-verbose'
 * is given.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up line order randomization and weighted sampling significantly
(details below). It is also used to terminate sampling with replacement.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, so there is no memory accumulation. These
algorithms support arbitrary size inputs. Sampling with replacement reads
all lines into memory and is limited by available memory. The line order
randomization algorithms hold the full output set in memory prior to
generating results. This ultimately limits the size of the output set. For
these, memory needs can be reduced by using a sample size (--n|num). This
engages reservoir sampling. Output order is not affected. Both
'tsv-sample -n 1000' and 'tsv-sample | head -n 1000' produce the same
results, but the former is quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";
180 
/** Container for command line options.
 *
 * The fields marked with an option name are filled in directly by getopt in
 * processArgs. The fields marked "Derived" are computed by processArgs from the
 * other options after parsing.
 */
struct TsvSampleOptions
{
    string programName;
    string[] files;
    bool helpVerbose = false;                  // --help-verbose
    bool hasHeader = false;                    // --H|header
    size_t sampleSize = 0;                     // --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  // --p|prob - Inclusion probability
    size_t[] keyFields;                        // --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    // --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           // --r|replace
    bool staticSeed = false;                   // --s|static-seed
    uint seedValueOptionArg = 0;               // --v|seed-value
    bool printRandom = false;                  // --print-random
    bool genRandomInorder = false;             // --gen-random-inorder
    string randomValueHeader = "random_value"; // --random-value-header
    bool compatibilityMode = false;            // --compatibility-mode
    char delim = '\t';                         // --d|delimiter
    bool versionWanted = false;                // --V|version
    bool preferSkipSampling = false;           // --prefer-skip-sampling
    bool preferAlgorithmR = false;             // --prefer-algorithm-r
    bool hasWeightField = false;               // Derived.
    bool useBernoulliSampling = false;         // Derived.
    bool useDistinctSampling = false;          // Derived.
    bool usingUnpredictableSeed = true;        // Derived from --static-seed, --seed-value
    uint seed = 0;                             // Derived from --static-seed, --seed-value

    /** Parses the command line, validates option combinations, and fills in the
     * derived fields.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first. On success it is
     *             truncated to length one; the remaining arguments are appended to
     *             'files' ("-" is used when no files are given).
     *
     * Returns: A tuple (bool, int). The first element is true if the caller should
     * continue with sampling. When it is false the second element is the exit code
     * to use: 0 after printing help or version information, 1 after an error. Errors
     * are reported on stderr; exceptions are not propagated to the caller.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsvutil : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected for output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                "k|key-fields",    "<field-list>  Fields to use as key for distinct sampling. Use with --p|prob.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "w|weight-field",  "NUM  Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
                "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header",  "     Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
                "compatibility-mode", "     Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,

                "d|delimiter",     "CHR  Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "prefer-skip-sampling", "     (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
                &preferSkipSampling,

                "prefer-algorithm-r",   "     (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
                &preferAlgorithmR,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            if (srsWithReplacement)
            {
                if (hasWeightField)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");
                }
                else if (!inclusionProbability.isNaN)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");
                }
                else if (keyFields.length > 0)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");
                }
                else if (printRandom || genRandomInorder)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");
                }
            }

            if (keyFields.length > 0)
            {
                if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields.");
            }

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */
            if (!inclusionProbability.isNaN)
            {
                if (inclusionProbability <= 0.0 || inclusionProbability > 1.0)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));
                }

                if (keyFields.length > 0) useDistinctSampling = true;
                else useBernoulliSampling = true;

                if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together.");

                if (genRandomInorder && !useDistinctSampling)
                {
                    /* --gen-random-inorder has no short form; name the option as getopt registers it. */
                    throw new Exception("--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used.");
                }
            }
            else if (genRandomInorder && !hasWeightField)
            {
                /* Unweighted --gen-random-inorder without --p|prob is handled by the
                 * Bernoulli sampling routines.
                 */
                useBernoulliSampling = true;
            }

            if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') ||
                randomValueHeader.canFind(delim))
            {
                throw new Exception("--random-value-header must be at least one character and not contain field delimiters or newlines.");
            }

            /* Random value printing implies compatibility-mode, otherwise user's selection is used. */
            if (printRandom || genRandomInorder) compatibilityMode = true;

            /* Seed. */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            /* Assume remaining args are files. Use standard input if files were not provided. */
            files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"];
            cmdArgs.length = 1;
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
/** Dispatches to the sampling routine selected by the command line options.
 *
 * The options are checked in a fixed order and the first match wins: sampling
 * with replacement, Bernoulli sampling, distinct sampling, weighted random value
 * generation (gen-random-inorder), reservoir sampling (non-zero sample size), and
 * finally full line order randomization.
 */
void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.srsWithReplacement)
    {
        simpleRandomSamplingWithReplacement(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useBernoulliSampling)
    {
        bernoulliSamplingCommand(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useDistinctSampling)
    {
        if (cmdopt.genRandomInorder) distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        else distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
        return;
    }

    if (cmdopt.genRandomInorder)
    {
        /* The cases above either handle gen-random-inorder themselves (Bernoulli,
         * Distinct) or do not support it (SRS w/ Replacement). What remains is the
         * weighted case.
         */
        assert(cmdopt.hasWeightField);
        generateWeightedRandomValuesInorder(cmdopt, outputStream);
        return;
    }

    if (cmdopt.sampleSize != 0) reservoirSamplingCommand(cmdopt, outputStream);
    else randomizeLinesCommand(cmdopt, outputStream);
}
404 
/** Bernoulli sampling on the input stream.
 *
 * This routine selects the appropriate bernoulli sampling function and template
 * instantiation to use based on the command line arguments.
 *
 * The standard bernoulliSampling routine is used when compatibility mode is on,
 * when the inclusion probability is 1.0 or more, or when the inclusion probability
 * is above skipSamplingProbabilityThreshold and skip sampling is not explicitly
 * preferred. Otherwise bernoulliSkipSampling is used.
 *
 * See the bernoulliSkipSampling routine for a discussion of the choices behind the
 * skipSamplingProbabilityThreshold used here.
 */
void bernoulliSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(!cmdopt.hasWeightField);

    immutable double skipSamplingProbabilityThreshold = 0.04;

    /* bernoulliSkipSampling requires an inclusion probability strictly below 1.0: it
     * asserts this, and its skip size calculation divides by log(1.0 - probability).
     * A probability of 1.0 is therefore always sent to the standard routine, even
     * when --prefer-skip-sampling is given. Every line is selected either way.
     */
    if (cmdopt.compatibilityMode ||
        cmdopt.inclusionProbability >= 1.0 ||
        (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling))
    {
        if (cmdopt.genRandomInorder)
        {
            bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
    }
    else
    {
        bernoulliSkipSampling(cmdopt, outputStream);
    }
}
437 
/** Bernoulli sampling on the input stream.
 *
 * Each input line is assigned a random value from uniform01. With
 * No.generateRandomAll a line is output if its value is less than
 * cmdopt.inclusionProbability, with the value prepended when cmdopt.printRandom is
 * set. With Yes.generateRandomAll every line is output with its value prepended.
 * The order of the lines is not changed.
 *
 * When cmdopt.hasHeader is set, the first line of each file is treated as a header.
 * Only the first header seen is written. Output stops once cmdopt.sampleSize lines
 * have been written, if the sample size is non-zero.
 */
void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;
    import std.random : Random = Mt19937, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    immutable randomValueFormatSpec = singleSpec("%.17g");
    auto randomGenerator = Random(cmdopt.seed);

    bool headerWritten = false;
    size_t numLinesWritten = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Header line. Written once, for the first file only. */
                if (!headerWritten)
                {
                    if (generateRandomAll || cmdopt.printRandom)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                    }

                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
                continue;
            }

            /* Data line. A random value is drawn for every data line, selected or not. */
            double lineScore = uniform01(randomGenerator);

            static if (generateRandomAll)
            {
                immutable bool lineSelected = true;
                immutable bool scoreWanted = true;
            }
            else
            {
                immutable bool lineSelected = lineScore < cmdopt.inclusionProbability;
                immutable bool scoreWanted = cmdopt.printRandom;
            }

            if (lineSelected)
            {
                if (scoreWanted)
                {
                    outputStream.formatValue(lineScore, randomValueFormatSpec);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");

                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }
            }
        }
    }
}
525 
/** bernoulliSkipSampling is an alternate implementation of bernoulliSampling that
 * uses skip sampling.
 *
 * Skip sampling works by skipping a random number of lines between selections. This
 * can be faster than assigning a random value to each line when the inclusion
 * probability is low, as it reduces the number of calls to the random number
 * generator. Both the random number generator and the log() function are called when
 * calculating the next skip size. These additional log() calls add up as the
 * probability increases.
 *
 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
 * file-oriented line sampling. This is obviously environment specific. In the
 * environments this implementation has been tested in the performance improvements
 * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
 *
 * The algorithm does not assign random values to individual lines. This makes it
 * incompatible with random value printing. It is not suitable for compatibility mode
 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
 * does not have this property.
 *
 * The inclusion probability must be strictly between 0.0 and 1.0. The skip size is
 * computed by dividing by log(1.0 - inclusionProbability), which is not usable at
 * either end of the range.
 *
 * The output stream is taken by 'auto ref', matching the other sampling routines, so
 * that lvalue output ranges are written to directly rather than through a copy.
 *
 * The algorithm for calculating the skip size has been described by multiple sources.
 * There are two key variants depending on whether the total number of lines in the
 * data set is known in advance. (This implementation does not know the total.)
 * Useful references:
 * - Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
 *   ACM Trans on Mathematical Software, 1987. On-line:
 *   http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
 * - P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
 *   "Data Stream Management", Springer-Verlag, 2016. On-line:
 *   https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
 * - Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
 *   http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
 */
void bernoulliSkipSampling(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.conv : to;
    import std.math : log, trunc;
    import std.random : Random = Mt19937, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.compatibilityMode);

    auto randomGenerator = Random(cmdopt.seed);

    immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
    immutable double logDiscardRate = log(discardRate);

    /* Number of lines to skip before the next selected line.
     *
     * Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed
     * interval to (0.0, 1.0], excluding 0.0.
     */
    size_t nextSkipCount()
    {
        return (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
    }

    size_t remainingSkips = nextSkipCount();

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Header line. Written once, for the first file only. Headers do not
                 * count against the skip counter.
                 */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else if (remainingSkips > 0)
            {
                --remainingSkips;
            }
            else
            {
                outputStream.put(line);
                outputStream.put("\n");

                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }

                remainingSkips = nextSkipCount();
            }
        }
    }
}
620 
621 /** Sample a subset of the unique values from the key fields.
622  *
623  * Distinct sampling is done by hashing the key and mapping the hash value into
624  * buckets matching the inclusion probability. Records having a key mapping to bucket
625  * zero are output.
626  *
627  * TODO: Add whole line as key.
628  */
void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsvutil : InputFieldReordering, throwIfWindowsNewlineOnUnix;

    /* The compile-time flag must agree with the run-time option. */
    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);

    static if (generateRandomAll)
    {
        import std.format : formatValue, singleSpec;
        immutable bucketFormatSpec = singleSpec("%d");
    }

    /* Separator placed between key fields when hashing a multi-field key. */
    immutable ubyte[1] delimArray = [cmdopt.delim];

    /* The number of buckets is the inverse of the inclusion probability, rounded. */
    uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint;

    /* Collects the key fields of each line. */
    auto keyFieldsReordering = new InputFieldReordering!char(cmdopt.keyFields);

    /* A leading value column is written in generate-all mode and when random
     * value printing was requested.
     */
    immutable bool writeBucketColumn = generateRandomAll || cmdopt.printRandom;

    bool headerWritten = false;
    size_t numLinesWritten = 0;

    /* Counts an output line when a sample size is set. Returns true once the
     * requested number of lines has been written.
     */
    bool sampleSizeReached()
    {
        if (cmdopt.sampleSize == 0) return false;
        ++numLinesWritten;
        return numLinesWritten == cmdopt.sampleSize;
    }

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Only the first header seen is written; later ones are dropped. */
                if (headerWritten) continue;

                if (writeBucketColumn)
                {
                    outputStream.put(cmdopt.randomValueHeader);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");
                headerWritten = true;
                continue;
            }

            /* Pull the key fields out of the line, stopping once all are found. */
            keyFieldsReordering.initNewLine;
            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                keyFieldsReordering.processNextField(fieldIndex, fieldValue);
                if (keyFieldsReordering.allFieldsFilled) break;
            }

            if (!keyFieldsReordering.allFieldsFilled)
            {
                import std.format : format;
                throw new Exception(
                    format("Not enough fields in line. File: %s, Line: %s",
                           (filename == "-") ? "Standard Input" : filename, fileLineNum));
            }

            /* Hash the delimiter-joined key, seeded with the random seed, and map
             * the hash to a bucket.
             */
            auto keyHasher = MurmurHash3!32(cmdopt.seed);
            foreach (keyNum, keyField; keyFieldsReordering.outputFields.enumerate)
            {
                if (keyNum != 0) keyHasher.put(delimArray);
                keyHasher.put(cast(ubyte[]) keyField);
            }
            keyHasher.finish;
            auto bucket = keyHasher.get % numBuckets;

            static if (generateRandomAll)
            {
                /* Every line is written, prefixed by its bucket number. */
                outputStream.formatValue(bucket, bucketFormatSpec);
                outputStream.put(cmdopt.delim);
            }
            else
            {
                /* Only lines whose key maps to bucket zero are written. */
                if (bucket != 0) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put('0');
                    outputStream.put(cmdopt.delim);
                }
            }

            outputStream.put(line);
            outputStream.put("\n");

            if (sampleSizeReached()) return;
        }
    }
}
747 
748 /** Reservoir sampling on the input stream.
749  *
 * This routine selects the appropriate reservoir sampling function and template
751  * instantiation to use based on the command line arguments.
752  *
753  * Reservoir sampling is used when a fixed size sample is being pulled from an input
754  * stream. Weighted and unweighted sampling is supported. These routines also
755  * randomize the order of the selected lines. This is consistent with line order
756  * randomization of the entire input stream (handled by randomizeLinesCommand).
757  *
 * For unweighted sampling, there is a performance tradeoff choice between the two
 * available implementations. See the reservoirSamplingAlgorithmR documentation for
 * information. The threshold used here was chosen based on performance tests.
761  */
762 
void reservoirSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(cmdopt.sampleSize != 0);

    /* Unweighted samples smaller than this use the heap-based implementation,
     * unless Algorithm R was explicitly preferred.
     */
    enum size_t algorithmRSampleSizeThreshold = 128 * 1024;

    /* Weighted sampling is only available in the heap-based implementation. */
    if (cmdopt.hasWeightField)
    {
        reservoirSamplingViaHeap!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    immutable bool isSmallSample = cmdopt.sampleSize < algorithmRSampleSizeThreshold;
    immutable bool useHeap =
        cmdopt.compatibilityMode || (isSmallSample && !cmdopt.preferAlgorithmR);

    if (useHeap) reservoirSamplingViaHeap!(No.isWeighted)(cmdopt, outputStream);
    else reservoirSamplingAlgorithmR(cmdopt, outputStream);
}
784 
/** Reservoir sampling using a heap. Both weighted and unweighted random sampling are
786  * supported.
787  *
788  * The algorithm used here is based on the one-pass algorithm described by Pavlos
789  * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
790  * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are
791  * simply set to one.
792  *
793  * The implementation uses a heap (priority queue) large enough to hold the desired
794  * number of lines. Input is read line-by-line, assigned a random value, and added to
 * the heap. The role of the heap is to identify the lines with the highest assigned
 * random values. Once the heap is full, adding a new line means dropping the line with
 * the lowest score. A "min" heap is used for this reason.
798  *
799  * When done reading all lines, the "min" heap is in the opposite order needed for
 * output. The desired order is obtained by removing each element one at a time from
801  * the heap. The underlying data store will have the elements in correct order.
802  *
803  * Generating output in weighted order matters for several reasons:
804  *  - For weighted sampling, it preserves the property that smaller valid subsets can be
805  *    created by taking the first N lines.
806  *  - For unweighted sampling, it ensures that all output permutations are possible, and
 *    are not influenced by input order or the heap data structure used.
808  *  - Order consistency when making repeated use of the same random seeds, but with
809  *    different sample sizes.
810  *
811  * There are use cases where only the selection set matters, for these some performance
812  * could be gained by skipping the reordering and simply printing the backing store
813  * array in-order, but making this distinction seems an unnecessary complication.
814  *
815  * Notes:
816  *  - In tsv-sample versions 1.2.1 and earlier this routine also supported randomization
817  *    of all input lines. This was dropped in version 1.2.2 in favor of the approach
 *    used in randomizeLines. The latter has significant advantages given that all
 *    data must be read into memory.
820  *  - For larger reservoir sizes better performance can be achieved by using
821  *    reservoirSamplingAlgorithmR. See the documentation for that function for details.
822  */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.container.array;
    import std.container.binaryheap;
    import std.format : formatValue, singleSpec;
    import std.random : Random = Mt19937, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    /* The compile-time flag must agree with the command line options. */
    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    /* A copy of an input line paired with the random score assigned to it. */
    struct Entry
    {
        double score;
        char[] line;
    }

    auto randomGenerator = Random(cmdopt.seed);
    immutable randomValueFormatSpec = singleSpec("%.17g");

    /* Create the heap and its backing data store.
     *
     * Note: An std.container.array is used as the backing store to avoid some issues in
     * the standard library (Phobos) binaryheap implementation. Specifically, when an
     * std.container.array is used as backing store, the heap can be efficiently reversed
     * by removing the heap elements. This leaves the backing store in the reversed order.
     * However, the current binaryheap implementation does not support this for all
     * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */
    Array!Entry dataStore;
    dataStore.reserve(cmdopt.sampleSize);
    auto reservoir = dataStore.heapify!("a.score > b.score")(0);  // Min binaryheap

    /* Read every line, keeping the sampleSize lines with the highest scores. */
    bool headerWritten = false;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Only the first header seen is written; later ones are dropped. */
                if (headerWritten) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put(cmdopt.randomValueHeader);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");
                headerWritten = true;
                continue;
            }

            static if (isWeighted)
            {
                /* Weighted score: u ^^ (1 / weight). Lines whose weight is not
                 * greater than zero get a score of zero, and no random value is
                 * drawn for them.
                 */
                double lineWeight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);
                double lineScore = 0.0;
                if (lineWeight > 0.0)
                {
                    lineScore = uniform01(randomGenerator) ^^ (1.0 / lineWeight);
                }
            }
            else
            {
                double lineScore = uniform01(randomGenerator);
            }

            /* Fill the reservoir first. After that a line gets in only by beating
             * the lowest score currently held, which sits at the heap front.
             * byLine reuses its buffer, so the line is copied when stored.
             */
            if (reservoir.length < cmdopt.sampleSize)
            {
                reservoir.insert(Entry(lineScore, line.dup));
            }
            else if (reservoir.front.score < lineScore)
            {
                reservoir.replaceFront(Entry(lineScore, line.dup));
            }
        }
    }

    /* All selected entries are in the reservoir. The heap is in reverse order of
     * assigned scores. Removing every element from the heap leaves the backing
     * store in the correct order for output.
     *
     * The asserts here avoid issues with the current binaryheap implementation. They
     * detect use of backing stores having a length not synchronized to the reservoir.
     */
    size_t numLines = reservoir.length;
    assert(numLines == dataStore.length);

    while (!reservoir.empty)
    {
        reservoir.removeFront;
    }
    assert(numLines == dataStore.length);

    foreach (selected; dataStore)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatValue(selected.score, randomValueFormatSpec);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(selected.line);
        outputStream.put("\n");
    }
}
936 
937 /** Generates weighted random values for all input lines, preserving input order.
938  *
939  * This complements weighted reservoir sampling, but instead of using a reservoir it
 * simply iterates over the input lines generating the values. The weighted random
 * values are generated with the same formula used by reservoirSamplingViaHeap.
942  */
void generateWeightedRandomValuesInorder(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;
    import std.random : Random = Mt19937, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.hasWeightField);

    immutable randomValueFormatSpec = singleSpec("%.17g");
    auto randomGenerator = Random(cmdopt.seed);

    bool headerWritten = false;
    size_t numLinesWritten = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Only the first header seen is written, with the random value
                 * column header in front of it.
                 */
                if (headerWritten) continue;

                outputStream.put(cmdopt.randomValueHeader);
                outputStream.put(cmdopt.delim);
                outputStream.put(line);
                outputStream.put("\n");
                headerWritten = true;
                continue;
            }

            /* Weighted score: u ^^ (1 / weight). Lines whose weight is not greater
             * than zero get a score of zero, and no random value is drawn for them.
             */
            double lineWeight = getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                                     filename, fileLineNum);
            double lineScore = 0.0;
            if (lineWeight > 0.0)
            {
                lineScore = uniform01(randomGenerator) ^^ (1.0 / lineWeight);
            }

            outputStream.formatValue(lineScore, randomValueFormatSpec);
            outputStream.put(cmdopt.delim);
            outputStream.put(line);
            outputStream.put("\n");

            /* A sample size of zero means no limit on the number of lines written. */
            if (cmdopt.sampleSize == 0) continue;

            ++numLinesWritten;
            if (numLinesWritten == cmdopt.sampleSize) return;
        }
    }
}
998 
999 /** Reservoir sampling, Algorithm R
1000  *
1001  * This is an implementation of reservoir sampling using what is commonly known as
1002  * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of
1003  * Computer Programming, Volume 2: Seminumerical Algorithms". More information about
1004  * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with
1005  * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
1006  * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
1007  *
1008  * Algorithm R is used for unweighted sampling without replacement. The heap-based
1009  * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
1010  *
1011  * The classic algorithm stops after identifying the selected set of items. This
1012  * implementation goes one step further and randomizes the order of the selected
1013  * lines. This supports the tsv-sample use-case, which is line order randomization.
1014  *
1015  * This algorithm is faster than reservoirSamplingViaHeap when the sample size
1016  * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
1017  * Insertion in this algorithm is O(1). Similarly, generating the random order in the
1018  * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
1019  *
1020  * This speed advantage may be offset a certain amount by using a more expensive random
1021  * value generator. reservoirSamplingViaHeap generates values between zero and one,
 * whereas reservoirSamplingAlgorithmR generates random integers over an ever growing
 * interval. The latter is expected to be more expensive. This is consistent with
 * performance tests indicating that reservoirSamplingViaHeap is faster when using
1025  * small-to-medium size reservoirs and large input streams.
1026  */
void reservoirSamplingAlgorithmR(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, randomShuffle, uniform;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    /* This routine handles only plain unweighted sampling of a fixed size. */
    assert(cmdopt.sampleSize > 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.compatibilityMode);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    string[] reservoir;
    auto reservoirAppender = appender(&reservoir);
    reservoirAppender.reserve(cmdopt.sampleSize);

    auto randomGenerator = Random(cmdopt.seed);

    /* Process each line. totalLineNum counts the data lines (non-header lines)
     * seen before the current one, across all files.
     */
    bool headerWritten = false;
    size_t totalLineNum = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the first header seen is written; later ones are dropped. */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                /* Add lines to the reservoir until the reservoir is filled.
                 * After that lines are added with decreasing likelihood, based on
                 * the total number of lines seen. If added to the reservoir, the
                 * line replaces a randomly chosen existing line.
                 */
                if (totalLineNum < cmdopt.sampleSize)
                {
                    /* byLine reuses its buffer, so the line is copied when stored. */
                    reservoirAppender ~= line.idup;
                }
                else
                {
                    /* The current line is data line number (totalLineNum + 1), so it
                     * must be kept with probability sampleSize / (totalLineNum + 1).
                     * uniform's upper bound is exclusive, so the draw is made over
                     * [0, totalLineNum + 1). Drawing over [0, totalLineNum) would
                     * always select the first line after the reservoir fills and
                     * over-select every later line.
                     */
                    size_t i = uniform(0, totalLineNum + 1, randomGenerator);
                    if (i < reservoir.length) reservoir[i] = line.idup;
                }

                ++totalLineNum;
            }
        }
    }

    /* The random sample is now in the reservoir. Shuffle it and print. */

    reservoir.randomShuffle(randomGenerator);

    foreach (ref line; reservoir)
    {
        outputStream.put(line);
        outputStream.put("\n");
    }
}
1096 
1097 /** Randomize all the lines in files or standard input.
1098  *
1099  * This routine selects the appropriate randomize-lines function and template instantiation
1100  * to use based on the command line arguments.
1101  */
void randomizeLinesCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    /* Weighted randomization always goes through the sort-based routine. */
    if (cmdopt.hasWeightField)
    {
        randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    /* Unweighted: compatibility mode uses the sort-based routine, otherwise the
     * shuffle-based routine is used.
     */
    if (cmdopt.compatibilityMode) randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
    else randomizeLinesViaShuffle(cmdopt, outputStream);
}
1118 
1119 /** Randomize all the lines in files or standard input.
1120  *
1121  * All lines in files and/or standard input are read in and written out in random
1122  * order. This algorithm assigns a random value to each line and sorts. This approach
1123  * supports both weighted sampling and simple random sampling (unweighted).
1124  *
1125  * This is significantly faster than heap-based reservoir sampling in the case where
1126  * the entire file is being read. See also randomizeLinesViaShuffle for the unweighted
 * case, as it is a little faster, at the cost of not supporting random value printing or
1128  * compatibility-mode.
1129  *
1130  * Input data size is limited by available memory. Disk oriented techniques are needed
1131  * when data sizes are larger. For example, generating random values line-by-line (ala
1132  * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
1133  */
void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map, sort;
    import std.format : formatValue, singleSpec;

    /* The compile-time flag must agree with the command line options. */
    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize == 0);

    immutable randomValueFormatSpec = singleSpec("%.17g");

    /* Read all file data into memory, one FileData per file. */
    auto fileData = cmdopt.files.map!FileData.array;

    /* Split the data into lines and assign each line a random value.
     * identifyFileLines also writes the first header line.
     */
    auto inputLines = fileData.identifyFileLines!(Yes.hasRandomValue, isWeighted)(cmdopt, outputStream);

    /* Order the lines from highest random value to lowest. */
    inputLines.sort!((lhs, rhs) => lhs.randomValue > rhs.randomValue);

    /* Write the lines, each preceded by its random value if requested. */
    foreach (sortedLine; inputLines)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatValue(sortedLine.randomValue, randomValueFormatSpec);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(sortedLine.data);
        outputStream.put("\n");
    }
}
1170 
1171 /** Randomize all the lines in files or standard input.
1172  *
1173  * All lines in files and/or standard input are read in and written out in random
1174  * order. This routine uses array shuffling, which is faster than sorting. This makes
1175  * this routine a good alternative to randomizeLinesViaSort when doing unweighted
1176  * randomization.
1177  *
1178  * Input data size is limited by available memory. Disk oriented techniques are needed
1179  * when data sizes are larger. For example, generating random values line-by-line (ala
1180  * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
1181  *
1182  * This routine does not support random value printing or compatibility-mode.
1183  */
void randomizeLinesViaShuffle(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, randomShuffle;

    /* This routine handles only plain unweighted randomization of all lines. */
    assert(cmdopt.sampleSize == 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    /* Read all file data into memory, one FileData per file, then split it into
     * lines. No random values are attached to the lines.
     */
    auto fileData = cmdopt.files.map!FileData.array;
    auto inputLines = fileData.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);

    /* Shuffle the lines in place using a generator seeded from the options.
     *
     * Note: Also tried randomCover, but that was exceedingly slow.
     */
    auto randomGenerator = Random(cmdopt.seed);
    inputLines.randomShuffle(randomGenerator);

    /* Write the lines in their shuffled order. */
    foreach (ref shuffledLine; inputLines)
    {
        outputStream.put(shuffledLine.data);
        outputStream.put("\n");
    }
}
1217 
1218 /** Simple random sampling with replacement.
1219  *
1220  * All lines in files and/or standard input are read in. Then random lines are selected
1221  * one at a time and output. Lines can be selected multiple times. This process continues
1222  * until the desired number of samples (--n|num) has been output. Output continues
1223  * indefinitely if a sample size was not provided.
1224  */
void simpleRandomSamplingWithReplacement(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, uniform;

    /* Read all file data into memory, one FileData per file, then split it into
     * lines.
     */
    auto fileData = cmdopt.files.map!FileData.array;
    auto inputLines = fileData.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);

    /* With no data lines there is nothing to select from. */
    if (inputLines.length == 0) return;

    auto randomGenerator = Random(cmdopt.seed);

    /* A sample size of zero means output never stops; otherwise exactly
     * sampleSize lines are written.
     */
    immutable bool isUnlimited = (cmdopt.sampleSize == 0);

    for (size_t numWritten = 0; isUnlimited || numWritten < cmdopt.sampleSize; )
    {
        /* Every draw picks from the full set of lines, so repeats are possible. */
        size_t index = uniform(0, inputLines.length, randomGenerator);
        outputStream.put(inputLines[index].data);
        outputStream.put("\n");

        if (!isUnlimited) ++numWritten;
    }
}
1253 
/** A container and reader for data from a file or standard input.
1255  *
1256  * The FileData struct is used to read data from a file or standard input. It is used
1257  * by passing a filename to the constructor. The constructor reads the file data.
1258  * If the filename is a single hyphen ('-') then data is read from standard input.
1259  *
 * The struct makes the data available through two members: 'filename', which is the
1261  * filename, and 'data', which is a character array of the data.
1262  */
struct FileData
{
    string filename;   // Name given to the constructor; "-" means standard input.
    char[] data;       // Everything read from the input.

    /* Reads all data from the named file, or from standard input when the name
     * is a single hyphen.
     */
    this(string fname)
    {
        import std.algorithm : min;
        import std.array : appender;

        filename = fname;

        immutable bool isStdin = (fname == "-");
        auto input = isStdin ? stdin : File(fname);
        auto contents = appender(&data);

        /* For named files, reserve space up front when a size is reported.
         * A size of ulong.max is treated as "unknown" and skipped.
         */
        if (!isStdin)
        {
            ulong numBytes = input.size;
            if (numBytes != ulong.max) contents.reserve(min(numBytes, size_t.max));
        }

        /* Read in fixed-size chunks, appending each chunk to 'data'. */
        ubyte[1024 * 128] chunkBuf;
        foreach (ref ubyte[] chunk; input.byChunk(chunkBuf)) contents.put(cast(char[]) chunk);
    }
}
1288 
/** Compile-time boolean flag used by identifyFileLines and InputLine to distinguish
 * use cases that need a random value per line from those that don't.
 */
alias HasRandomValue = Flag!"hasRandomValue";

/** One non-header input line. identifyFileLines returns an array of these.
 *
 * The 'data' member contains the line. A 'randomValue' member is present only when
 * the struct is instantiated with the hasRandomValue flag set.
 */
struct InputLine(HasRandomValue hasRandomValue)
{
    char[] data;

    static if (hasRandomValue)
    {
        double randomValue;
    }
}
1303 
1304 /** identifyFileLines is used by algorithms that read all files into memory prior to
1305  * processing. It does the initial processing of the file data.
1306  *
1307  * Three primary tasks are performed. One is splitting all input data into lines. The
 * second is writing the header line from the first file to the output stream. Header
1309  * lines from subsequent files are ignored. Third is assigning a random value to the
1310  * line, if random values are being generated.
1311  *
1312  * The key input is a FileData array, one element for each file. The FileData reads
1313  * the file when instantiated.
1314  *
1315  * The return value is an array of InputLine structs. The struct will have a 'randomValue'
1316  * member if random values are being assigned.
1317  */
InputLine!hasRandomValue[] identifyFileLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted, OutputRange)
(ref FileData[] fileData, TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    /* Weighting needs random values, and random values cannot be printed unless
     * they are generated.
     */
    static assert(hasRandomValue || !isWeighted);
    static if(!hasRandomValue) assert(!cmdopt.printRandom);

    alias Line = InputLine!hasRandomValue;

    Line[] inputLines;
    auto linesAppender = appender(&inputLines);

    static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed);
    bool headerWritten = false;

    foreach (fd; fileData)
    {
        /* Drop the last newline to avoid adding an extra empty line. */
        auto data = fd.data;
        if (data.length > 0 && data[$ - 1] == '\n') data = data[0 .. $ - 1];

        foreach (fileLineNum, ref line; data.splitter('\n').enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, fd.filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Only the first header seen is written; later ones are dropped. */
                if (headerWritten) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put(cmdopt.randomValueHeader);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");
                headerWritten = true;
                continue;
            }

            static if (hasRandomValue)
            {
                static if (isWeighted)
                {
                    /* Weighted value: u ^^ (1 / weight). Lines whose weight is not
                     * greater than zero get zero, and no random value is drawn.
                     */
                    double lineWeight =
                        getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                             fd.filename, fileLineNum);
                    double randomValue = 0.0;
                    if (lineWeight > 0.0)
                    {
                        randomValue = uniform01(randomGenerator) ^^ (1.0 / lineWeight);
                    }
                }
                else
                {
                    double randomValue = uniform01(randomGenerator);
                }

                linesAppender.put(Line(line, randomValue));
            }
            else
            {
                linesAppender.put(Line(line));
            }
        }
    }

    return inputLines;
}
1388 
1389 
1390 /** Convenience function for extracting a single field from a line. See getTsvFieldValue in
1391  * common/src/tsvutils.d for details. This wrapper creates error text tailored for this program.
1392  */
1393 import std.traits : isSomeChar;
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException;
    import std.format : format;
    import tsvutil : getTsvFieldValue;

    /* Name used for the input source in error messages. */
    const sourceName = (filename == "-") ? "Standard Input" : filename;

    try
    {
        return getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* The field could not be converted to T. On the first line, add a hint
         * that the line may be a header.
         */
        const headerHint = (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : "";
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s%s",
                   exc.msg, sourceName, lineNum, headerHint));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s",
                   exc.msg, sourceName, lineNum));
    }
}
1423 
unittest
{
    /* Basic sanity checks on the getFieldValue wrapper. getTsvFieldValue has its own
     * tests, so only the wrapper's pass-through and error paths are exercised here.
     */
    import std.exception : assertThrown;

    /* Successful extraction and conversion of the first field. */
    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);

    /* A field that cannot be converted to double. Line numbers 1 and 2 are both
     * checked because line 1 takes the branch that adds the header hint.
     */
    foreach (size_t lineNum; 1 .. 3)
    {
        assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", lineNum));
    }

    /* A field index beyond the fields available on the line. */
    foreach (size_t lineNum; 1 .. 3)
    {
        assertThrown(getFieldValue!double("123", 1, '\t', "unittest", lineNum));
    }
}
1438 
1439 /* Unit tests for the main program start here.
1440  *
1441  * Portability note: Many of the tests here rely on generating consistent random numbers
1442  * across different platforms when using the same random seed. So far this has succeeded
 * on several different platform, compiler, and library versions. However, it is certainly
1444  * possible this condition will not hold on other platforms.
1445  *
1446  * For tsv-sample, this portability implies generating the same results on different
1447  * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
1448  * but it is convenient for testing. If platforms are identified that do not generate
1449  * the same results these tests will need to be adjusted.
1450  */
version(unittest)
{
    /* Helper functions shared by the unit tests. */

    import unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /* Runs tsvSample for one command line and asserts on the result.
     *
     * cmdArgs is processed by TsvSampleOptions.processArgs, the same way main does.
     * tsvSample then writes into an in-memory character buffer rather than stdout,
     * and the captured text is compared with 'expected' after conversion by
     * tsvDataToString. cmdArgs must be non-empty; cmdArgs[0] appears in assert messages.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Builds an assert message prefixed with the helper name and cmdArgs[0]. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            return format("[testTsvSample] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        /* Printable copy of the arguments, taken before processArgs sees them.
         * NOTE(review): presumably processArgs can modify cmdArgs - confirm.
         */
        string argsAsText = cmdArgs.to!string;

        TsvSampleOptions options;
        auto parseResult = options.processArgs(cmdArgs);
        assert(parseResult[0],
               formatAssertMessage("Invalid command lines arg: '%s'.", argsAsText));

        /* Capture the program output in memory. */
        auto captured = appender!(char[])();
        tsvSample(options, captured);    // This invokes the main code line.

        auto expectedText = expected.tsvDataToString;

        assert(captured.data == expectedText,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedText.to!string, captured.data.to!string));
    }
}
1487 
1488 unittest
1489 {
1490     import std.path : buildPath;
1491     import std.file : rmdirRecurse;
1492     import std.format : format;
1493 
1494     auto testDir = makeUnittestTempDir("tsv_sample");
1495     scope(exit) testDir.rmdirRecurse;
1496 
1497     /* Tabular data sets and expected results use the built-in static seed.
1498      * Tests are run by writing the data set to a file, then calling the main
1499      * routine to process. The function testTsvSample plays the role of the
     * main program. Rather than being written to standard output, the results
     * are captured in memory and compared with the expected output. The
     * expected results were verified by hand prior to inclusion in the test.
1503      *
1504      * The initial part of this section is simply setting up data files and
1505      * expected results.
1506      *
1507      * Expected results naming conventions:
1508      *  - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected
1509      *  - Sampling Type (required): Permute, Replace, Bernoulli, Distinct
1510      *  - Compatibility: Compat, AlgoR, Skip, Swap
1511      *  - Weight Field: Wt<num>, e.g. Wt3
     *  - Sample Size: Num<num>, e.g. Num3
     *  - Seed Value: V<num>, e.g. V77
     *  - Key Field: K<num>, e.g. K2
     *  - Probability: P<num>, e.g. P05 (5%)
     *  - Printing Probabilities: Probs
1517      *  - Printing Probs in order: ProbsInorder
1518      *  - Printing Probs with custom header: RVCustom
1519      */
1520 
1521     /* Empty file. */
1522     string[][] dataEmpty = [];
1523     string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
1524     writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
1525 
    /* 3x0, header only. */
1527     string[][] data3x0 = [["field_a", "field_b", "field_c"]];
1528     string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
1529     writeUnittestTsvFile(fpath_data3x0, data3x0);
1530 
1531     /* 3x1 */
1532     string[][] data3x1 =
1533         [["field_a", "field_b", "field_c"],
1534          ["tan", "タン", "8.5"]];
1535 
1536     string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
1537     string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
1538     writeUnittestTsvFile(fpath_data3x1, data3x1);
1539     writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]);
1540 
1541     string[][] data3x1ExpectedReplaceNum3 =
1542         [["field_a", "field_b", "field_c"],
1543          ["tan", "タン", "8.5"],
1544          ["tan", "タン", "8.5"],
1545          ["tan", "タン", "8.5"]];
1546 
1547     /* 3x2 */
1548     string[][] data3x2 =
1549         [["field_a", "field_b", "field_c"],
1550          ["brown", "褐色", "29.2"],
1551          ["gray", "グレー", "6.2"]];
1552 
1553     string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
1554     string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
1555     writeUnittestTsvFile(fpath_data3x2, data3x2);
1556     writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]);
1557 
1558     string[][] data3x2PermuteCompat =
1559         [["field_a", "field_b", "field_c"],
1560          ["gray", "グレー", "6.2"],
1561          ["brown", "褐色", "29.2"]];
1562 
1563     string[][] data3x2PermuteShuffle =
1564         [["field_a", "field_b", "field_c"],
1565          ["gray", "グレー", "6.2"],
1566          ["brown", "褐色", "29.2"]];
1567 
1568     /* 3x3 */
1569     string[][] data3x3 =
1570         [["field_a", "field_b", "field_c"],
1571          ["orange", "オレンジ", "2.5"],
1572          ["pink", "ピンク", "1.1"],
1573          ["purple", "紫の", "42"]];
1574 
1575     string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
1576     string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
1577     writeUnittestTsvFile(fpath_data3x3, data3x3);
1578     writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]);
1579 
1580     string[][] data3x3ExpectedPermuteCompat =
1581         [["field_a", "field_b", "field_c"],
1582          ["purple", "紫の", "42"],
1583          ["pink", "ピンク", "1.1"],
1584          ["orange", "オレンジ", "2.5"]];
1585 
1586     string[][] data3x3ExpectedPermuteSwap =
1587         [["field_a", "field_b", "field_c"],
1588          ["purple", "紫の", "42"],
1589          ["orange", "オレンジ", "2.5"],
1590          ["pink", "ピンク", "1.1"]];
1591 
1592     /* 3x6 */
1593     string[][] data3x6 =
1594         [["field_a", "field_b", "field_c"],
1595          ["red", "赤", "23.8"],
1596          ["green", "緑", "0.0072"],
1597          ["white", "白", "1.65"],
1598          ["yellow", "黄", "12"],
1599          ["blue", "青", "12"],
1600          ["black", "黒", "0.983"]];
1601     string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
1602     string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
1603     writeUnittestTsvFile(fpath_data3x6, data3x6);
1604     writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]);
1605 
1606     // Randomization, all lines
1607     string[][] data3x6ExpectedPermuteCompat =
1608         [["field_a", "field_b", "field_c"],
1609          ["yellow", "黄", "12"],
1610          ["black", "黒", "0.983"],
1611          ["blue", "青", "12"],
1612          ["white", "白", "1.65"],
1613          ["green", "緑", "0.0072"],
1614          ["red", "赤", "23.8"]];
1615 
1616     string[][] data3x6ExpectedPermuteSwap =
1617         [["field_a", "field_b", "field_c"],
1618          ["black", "黒", "0.983"],
1619          ["green", "緑", "0.0072"],
1620          ["red", "赤", "23.8"],
1621          ["yellow", "黄", "12"],
1622          ["white", "白", "1.65"],
1623          ["blue", "青", "12"]];
1624 
1625     string[][] data3x6ExpectedPermuteCompatProbs =
1626         [["random_value", "field_a", "field_b", "field_c"],
1627          ["0.96055546286515892", "yellow", "黄", "12"],
1628          ["0.7571015392895788", "black", "黒", "0.983"],
1629          ["0.52525980887003243", "blue", "青", "12"],
1630          ["0.49287854949943721", "white", "白", "1.65"],
1631          ["0.15929344086907804", "green", "緑", "0.0072"],
1632          ["0.010968807619065046", "red", "赤", "23.8"]];
1633 
    /* Note: data3x6ExpectedPermuteAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because
1635      * both are effectively the same algorithm given that --num is data length. Both read
1636      * in the full data in order then call randomShuffle.
1637      */
1638     string[][] data3x6ExpectedPermuteAlgoRNum6 =
1639         [["field_a", "field_b", "field_c"],
1640          ["black", "黒", "0.983"],
1641          ["green", "緑", "0.0072"],
1642          ["red", "赤", "23.8"],
1643          ["yellow", "黄", "12"],
1644          ["white", "白", "1.65"],
1645          ["blue", "青", "12"]];
1646 
1647     string[][] data3x6ExpectedPermuteAlgoRNum5 =
1648         [["field_a", "field_b", "field_c"],
1649          ["red", "赤", "23.8"],
1650          ["black", "黒", "0.983"],
1651          ["white", "白", "1.65"],
1652          ["green", "緑", "0.0072"],
1653          ["yellow", "黄", "12"]];
1654 
1655     string[][] data3x6ExpectedPermuteAlgoRNum4 =
1656         [["field_a", "field_b", "field_c"],
1657          ["blue", "青", "12"],
1658          ["green", "緑", "0.0072"],
1659          ["black", "黒", "0.983"],
1660          ["white", "白", "1.65"]];
1661 
1662     string[][] data3x6ExpectedPermuteAlgoRNum3 =
1663         [["field_a", "field_b", "field_c"],
1664          ["red", "赤", "23.8"],
1665          ["black", "黒", "0.983"],
1666          ["green", "緑", "0.0072"]];
1667 
1668     string[][] data3x6ExpectedPermuteAlgoRNum2 =
1669         [["field_a", "field_b", "field_c"],
1670          ["black", "黒", "0.983"],
1671          ["red", "赤", "23.8"]];
1672 
1673     string[][] data3x6ExpectedPermuteAlgoRNum1 =
1674         [["field_a", "field_b", "field_c"],
1675          ["green", "緑", "0.0072"]];
1676 
1677     string[][] data3x6ExpectedBernoulliProbsP100 =
1678         [["random_value", "field_a", "field_b", "field_c"],
1679          ["0.010968807619065046", "red", "赤", "23.8"],
1680          ["0.15929344086907804", "green", "緑", "0.0072"],
1681          ["0.49287854949943721", "white", "白", "1.65"],
1682          ["0.96055546286515892", "yellow", "黄", "12"],
1683          ["0.52525980887003243", "blue", "青", "12"],
1684          ["0.7571015392895788", "black", "黒", "0.983"]];
1685 
1686     string[][] data3x6ExpectedBernoulliCompatProbsP60 =
1687         [["random_value", "field_a", "field_b", "field_c"],
1688          ["0.010968807619065046", "red", "赤", "23.8"],
1689          ["0.15929344086907804", "green", "緑", "0.0072"],
1690          ["0.49287854949943721", "white", "白", "1.65"],
1691          ["0.52525980887003243", "blue", "青", "12"]];
1692 
1693     string[][] data3x6ExpectedBernoulliSkipP40 =
1694         [["field_a", "field_b", "field_c"],
1695          ["red", "赤", "23.8"],
1696          ["green", "緑", "0.0072"],
1697          ["yellow", "黄", "12"]];
1698 
1699     string[][] data3x6ExpectedBernoulliCompatP60 =
1700         [["field_a", "field_b", "field_c"],
1701          ["red", "赤", "23.8"],
1702          ["green", "緑", "0.0072"],
1703          ["white", "白", "1.65"],
1704          ["blue", "青", "12"]];
1705 
1706     string[][] data3x6ExpectedDistinctK1K3P60 =
1707         [["field_a", "field_b", "field_c"],
1708          ["green", "緑", "0.0072"],
1709          ["white", "白", "1.65"],
1710          ["blue", "青", "12"]];
1711 
1712     string[][] data3x6ExpectedDistinctK1K3P60Probs =
1713         [["random_value", "field_a", "field_b", "field_c"],
1714          ["0", "green", "緑", "0.0072"],
1715          ["0", "white", "白", "1.65"],
1716          ["0", "blue", "青", "12"]];
1717 
1718     string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom =
1719         [["custom_random_value_header", "field_a", "field_b", "field_c"],
1720          ["0", "green", "緑", "0.0072"],
1721          ["0", "white", "白", "1.65"],
1722          ["0", "blue", "青", "12"]];
1723 
1724     string[][] data3x6ExpectedDistinctK2P2ProbsInorder =
1725         [["random_value", "field_a", "field_b", "field_c"],
1726          ["1", "red", "赤", "23.8"],
1727          ["0", "green", "緑", "0.0072"],
1728          ["0", "white", "白", "1.65"],
1729          ["1", "yellow", "黄", "12"],
1730          ["3", "blue", "青", "12"],
1731          ["2", "black", "黒", "0.983"]];
1732 
1733     string[][] data3x6ExpectedPermuteWt3Probs =
1734         [["random_value", "field_a", "field_b", "field_c"],
1735          ["0.9966519875764539", "yellow", "黄", "12"],
1736          ["0.94775884809836686", "blue", "青", "12"],
1737          ["0.82728234682286661", "red", "赤", "23.8"],
1738          ["0.75346697377181959", "black", "黒", "0.983"],
1739          ["0.65130103496422487", "white", "白", "1.65"],
1740          ["1.5636943712879866e-111", "green", "緑", "0.0072"]];
1741 
1742     string[][] data3x6ExpectedWt3ProbsInorder =
1743         [["random_value", "field_a", "field_b", "field_c"],
1744          ["0.82728234682286661", "red", "赤", "23.8"],
1745          ["1.5636943712879866e-111", "green", "緑", "0.0072"],
1746          ["0.65130103496422487", "white", "白", "1.65"],
1747          ["0.9966519875764539", "yellow", "黄", "12"],
1748          ["0.94775884809836686", "blue", "青", "12"],
1749          ["0.75346697377181959", "black", "黒", "0.983"]];
1750 
1751     string[][] data3x6ExpectedPermuteWt3 =
1752         [["field_a", "field_b", "field_c"],
1753          ["yellow", "黄", "12"],
1754          ["blue", "青", "12"],
1755          ["red", "赤", "23.8"],
1756          ["black", "黒", "0.983"],
1757          ["white", "白", "1.65"],
1758          ["green", "緑", "0.0072"]];
1759 
1760     string[][] data3x6ExpectedReplaceNum10 =
1761         [["field_a", "field_b", "field_c"],
1762          ["black", "黒", "0.983"],
1763          ["green", "緑", "0.0072"],
1764          ["green", "緑", "0.0072"],
1765          ["red", "赤", "23.8"],
1766          ["yellow", "黄", "12"],
1767          ["red", "赤", "23.8"],
1768          ["white", "白", "1.65"],
1769          ["yellow", "黄", "12"],
1770          ["yellow", "黄", "12"],
1771          ["white", "白", "1.65"],
1772         ];
1773 
1774     string[][] data3x6ExpectedReplaceNum10V77 =
1775         [["field_a", "field_b", "field_c"],
1776          ["black", "黒", "0.983"],
1777          ["red", "赤", "23.8"],
1778          ["black", "黒", "0.983"],
1779          ["yellow", "黄", "12"],
1780          ["green", "緑", "0.0072"],
1781          ["green", "緑", "0.0072"],
1782          ["green", "緑", "0.0072"],
1783          ["yellow", "黄", "12"],
1784          ["blue", "青", "12"],
1785          ["white", "白", "1.65"],
1786         ];
1787 
1788     /* Using a different static seed. */
1789     string[][] data3x6ExpectedPermuteCompatV41Probs =
1790         [["random_value", "field_a", "field_b", "field_c"],
1791          ["0.68057272653095424", "green", "緑", "0.0072"],
1792          ["0.67681624367833138", "blue", "青", "12"],
1793          ["0.32097338931635022", "yellow", "黄", "12"],
1794          ["0.25092361867427826", "red", "赤", "23.8"],
1795          ["0.15535934292711318", "black", "黒", "0.983"],
1796          ["0.04609582107514143", "white", "白", "1.65"]];
1797 
1798     string[][] data3x6ExpectedBernoulliCompatP60V41Probs =
1799         [["random_value", "field_a", "field_b", "field_c"],
1800          ["0.25092361867427826", "red", "赤", "23.8"],
1801          ["0.04609582107514143", "white", "白", "1.65"],
1802          ["0.32097338931635022", "yellow", "黄", "12"],
1803          ["0.15535934292711318", "black", "黒", "0.983"]];
1804 
1805     string[][] data3x6ExpectedPermuteWt3V41Probs =
1806         [["random_value", "field_a", "field_b", "field_c"],
1807          ["0.96799377498910666", "blue", "青", "12"],
1808          ["0.94356245792573568", "red", "赤", "23.8"],
1809          ["0.90964601024271996", "yellow", "黄", "12"],
1810          ["0.15491658409260103", "white", "白", "1.65"],
1811          ["0.15043620392537033", "black", "黒", "0.983"],
1812          ["6.1394674830701461e-24", "green", "緑", "0.0072"]];
1813 
1814     string[][] data3x6ExpectedWt3V41ProbsInorder =
1815         [["random_value", "field_a", "field_b", "field_c"],
1816          ["0.94356245792573568", "red", "赤", "23.8"],
1817          ["6.1394674830701461e-24", "green", "緑", "0.0072"],
1818          ["0.15491658409260103", "white", "白", "1.65"],
1819          ["0.90964601024271996", "yellow", "黄", "12"],
1820          ["0.96799377498910666", "blue", "青", "12"],
1821          ["0.15043620392537033", "black", "黒", "0.983"]];
1822 
1823 
1824     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
1825     string[][] combo1ExpectedPermuteCompat =
1826         [["field_a", "field_b", "field_c"],
1827          ["yellow", "黄", "12"],
1828          ["tan", "タン", "8.5"],
1829          ["brown", "褐色", "29.2"],
1830          ["green", "緑", "0.0072"],
1831          ["red", "赤", "23.8"],
1832          ["purple", "紫の", "42"],
1833          ["black", "黒", "0.983"],
1834          ["white", "白", "1.65"],
1835          ["gray", "グレー", "6.2"],
1836          ["blue", "青", "12"],
1837          ["pink", "ピンク", "1.1"],
1838          ["orange", "オレンジ", "2.5"]];
1839 
1840     string[][] combo1ExpectedPermuteCompatProbs =
1841         [["random_value", "field_a", "field_b", "field_c"],
1842          ["0.97088520275428891", "yellow", "黄", "12"],
1843          ["0.96055546286515892", "tan", "タン", "8.5"],
1844          ["0.81756894313730299", "brown", "褐色", "29.2"],
1845          ["0.7571015392895788", "green", "緑", "0.0072"],
1846          ["0.52525980887003243", "red", "赤", "23.8"],
1847          ["0.49287854949943721", "purple", "紫の", "42"],
1848          ["0.47081507067196071", "black", "黒", "0.983"],
1849          ["0.38388182921335101", "white", "白", "1.65"],
1850          ["0.29215990612283349", "gray", "グレー", "6.2"],
1851          ["0.24033216014504433", "blue", "青", "12"],
1852          ["0.15929344086907804", "pink", "ピンク", "1.1"],
1853          ["0.010968807619065046", "orange", "オレンジ", "2.5"]];
1854 
1855     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
1856     string[][] combo1ExpectedProbsInorder =
1857         [["random_value", "field_a", "field_b", "field_c"],
1858          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
1859          ["0.15929344086907804", "pink", "ピンク", "1.1"],
1860          ["0.49287854949943721", "purple", "紫の", "42"],
1861          ["0.96055546286515892", "tan", "タン", "8.5"],
1862          ["0.52525980887003243", "red", "赤", "23.8"],
1863          ["0.7571015392895788", "green", "緑", "0.0072"],
1864          ["0.38388182921335101", "white", "白", "1.65"],
1865          ["0.97088520275428891", "yellow", "黄", "12"],
1866          ["0.24033216014504433", "blue", "青", "12"],
1867          ["0.47081507067196071", "black", "黒", "0.983"],
1868          ["0.81756894313730299", "brown", "褐色", "29.2"],
1869          ["0.29215990612283349", "gray", "グレー", "6.2"]];
1870 
1871     string[][] combo1ExpectedBernoulliCompatP50Probs =
1872         [["random_value", "field_a", "field_b", "field_c"],
1873          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
1874          ["0.15929344086907804", "pink", "ピンク", "1.1"],
1875          ["0.49287854949943721", "purple", "紫の", "42"],
1876          ["0.38388182921335101", "white", "白", "1.65"],
1877          ["0.24033216014504433", "blue", "青", "12"],
1878          ["0.47081507067196071", "black", "黒", "0.983"],
1879          ["0.29215990612283349", "gray", "グレー", "6.2"]];
1880 
1881     string[][] combo1ExpectedBernoulliCompatP40 =
1882         [["field_a", "field_b", "field_c"],
1883          ["orange", "オレンジ", "2.5"],
1884          ["pink", "ピンク", "1.1"],
1885          ["white", "白", "1.65"],
1886          ["blue", "青", "12"],
1887          ["gray", "グレー", "6.2"]];
1888 
1889     string[][] combo1ExpectedDistinctK1P40 =
1890         [["field_a", "field_b", "field_c"],
1891          ["orange", "オレンジ", "2.5"],
1892          ["red", "赤", "23.8"],
1893          ["green", "緑", "0.0072"],
1894          ["blue", "青", "12"],
1895          ["black", "黒", "0.983"]];
1896 
1897     string[][] combo1ExpectedPermuteWt3Probs =
1898         [["random_value", "field_a", "field_b", "field_c"],
1899          ["0.99754077523718754", "yellow", "黄", "12"],
1900          ["0.99527665440088786", "tan", "タン", "8.5"],
1901          ["0.99312578945741659", "brown", "褐色", "29.2"],
1902          ["0.98329602553389361", "purple", "紫の", "42"],
1903          ["0.9733096193808366", "red", "赤", "23.8"],
1904          ["0.88797551521739648", "blue", "青", "12"],
1905          ["0.81999230489041786", "gray", "グレー", "6.2"],
1906          ["0.55975569204250941", "white", "白", "1.65"],
1907          ["0.46472135609205739", "black", "黒", "0.983"],
1908          ["0.18824582704191337", "pink", "ピンク", "1.1"],
1909          ["0.1644613185329992", "orange", "オレンジ", "2.5"],
1910          ["1.6438086931020549e-17", "green", "緑", "0.0072"]];
1911 
1912     string[][] combo1ExpectedPermuteWt3 =
1913         [["field_a", "field_b", "field_c"],
1914          ["yellow", "黄", "12"],
1915          ["tan", "タン", "8.5"],
1916          ["brown", "褐色", "29.2"],
1917          ["purple", "紫の", "42"],
1918          ["red", "赤", "23.8"],
1919          ["blue", "青", "12"],
1920          ["gray", "グレー", "6.2"],
1921          ["white", "白", "1.65"],
1922          ["black", "黒", "0.983"],
1923          ["pink", "ピンク", "1.1"],
1924          ["orange", "オレンジ", "2.5"],
1925          ["green", "緑", "0.0072"]];
1926 
1927         string[][] combo1ExpectedPermuteAlgoRNum4 =
1928         [["field_a", "field_b", "field_c"],
1929          ["blue", "青", "12"],
1930          ["gray", "グレー", "6.2"],
1931          ["brown", "褐色", "29.2"],
1932          ["white", "白", "1.65"]];
1933 
1934     string[][] combo1ExpectedReplaceNum10 =
1935         [["field_a", "field_b", "field_c"],
1936          ["gray", "グレー", "6.2"],
1937          ["yellow", "黄", "12"],
1938          ["yellow", "黄", "12"],
1939          ["white", "白", "1.65"],
1940          ["tan", "タン", "8.5"],
1941          ["white", "白", "1.65"],
1942          ["blue", "青", "12"],
1943          ["black", "黒", "0.983"],
1944          ["tan", "タン", "8.5"],
1945          ["purple", "紫の", "42"]];
1946 
1947     /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. */
1948     string[][] data1x200 =
1949         [["field_a"],
1950          ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"],
1951          ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"],
1952          ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"],
1953          ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"],
1954          ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"],
1955          ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"],
1956          ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"],
1957          ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"],
1958          ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"],
1959          ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"],
1960          ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"],
1961          ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"],
1962          ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"],
1963          ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"],
1964          ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"],
1965          ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"],
1966          ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"],
1967          ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"],
1968          ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"],
1969          ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"],
1970         ];
1971 
1972     string fpath_data1x200 = buildPath(testDir, "data1x200.tsv");
1973     string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv");
1974     writeUnittestTsvFile(fpath_data1x200, data1x200);
1975     writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1..$]);
1976 
1977     string[][] data1x200ExpectedBernoulliSkipV333P01 =
1978         [["field_a"],
1979          ["077"],
1980          ["119"]];
1981 
1982     string[][] data1x200ExpectedBernoulliSkipV333P02 =
1983         [["field_a"],
1984          ["038"],
1985          ["059"],
1986          ["124"],
1987          ["161"],
1988          ["162"],
1989          ["183"]];
1990 
1991     string[][] data1x200ExpectedBernoulliSkipV333P03 =
1992         [["field_a"],
1993          ["025"],
1994          ["039"],
1995          ["082"],
1996          ["107"],
1997          ["108"],
1998          ["122"],
1999          ["136"],
2000          ["166"],
2001          ["182"]];
2002 
2003     string[][] data1x200ExpectedBernoulliCompatV333P01 =
2004         [["field_a"],
2005          ["072"]];
2006 
2007     string[][] data1x200ExpectedBernoulliCompatV333P02 =
2008         [["field_a"],
2009          ["004"],
2010          ["072"]];
2011 
2012     string[][] data1x200ExpectedBernoulliCompatV333P03 =
2013         [["field_a"],
2014          ["004"],
2015          ["072"],
2016          ["181"]];
2017 
2018     /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files,
2019      * only expected results. The header is from 3x0, the results are offset 1-position
2020      * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line.
2021      */
2022     string[][] combo2ExpectedBernoulliSkipV333P03 =
2023         [["field_a", "field_b", "field_c"],
2024          ["024"],
2025          ["038"],
2026          ["081"],
2027          ["106"],
2028          ["107"],
2029          ["121"],
2030          ["135"],
2031          ["165"],
2032          ["181"]];
2033 
2034 
2035     /* 1x10 - Simple 1-column file. */
2036     string[][] data1x10 =
2037         [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
2038     string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
2039     string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
2040     writeUnittestTsvFile(fpath_data1x10, data1x10);
2041     writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]);
2042 
2043     string[][] data1x10ExpectedPermuteCompat =
2044         [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
2045 
2046     string[][] data1x10ExpectedPermuteWt1 =
2047         [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]];
2048 
2049     /* 2x10a - Uniform distribution [0,1]. */
2050     string[][] data2x10a =
2051         [["line", "weight"],
2052          ["1", "0.26788837"],
2053          ["2", "0.06601298"],
2054          ["3", "0.38627527"],
2055          ["4", "0.47379424"],
2056          ["5", "0.02966641"],
2057          ["6", "0.05636231"],
2058          ["7", "0.70529242"],
2059          ["8", "0.91836862"],
2060          ["9", "0.99103720"],
2061          ["10", "0.31401740"]];
2062 
2063     string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv");
2064     writeUnittestTsvFile(fpath_data2x10a, data2x10a);
2065 
2066     string[][] data2x10aExpectedPermuteWt2Probs =
2067         [["random_value", "line", "weight"],
2068          ["0.96833865494543658", "8", "0.91836862"],
2069          ["0.91856842054413923", "4", "0.47379424"],
2070          ["0.25730832087795091", "7", "0.70529242"],
2071          ["0.2372531790701812", "9", "0.99103720"],
2072          ["0.16016096701872204", "3", "0.38627527"],
2073          ["0.090819662667243381", "10", "0.31401740"],
2074          ["0.0071764539244361172", "6", "0.05636231"],
2075          ["4.8318642951630057e-08", "1", "0.26788837"],
2076          ["3.7525692966535517e-10", "5", "0.02966641"],
2077          ["8.2123247880095796e-13", "2", "0.06601298"]];
2078 
2079     /* 2x10b - Uniform distribution [0,1000]. */
2080     string[][] data2x10b =
2081         [["line", "weight"],
2082          ["1", "761"],
2083          ["2", "432"],
2084          ["3", "103"],
2085          ["4", "448"],
2086          ["5", "750"],
2087          ["6", "711"],
2088          ["7", "867"],
2089          ["8", "841"],
2090          ["9", "963"],
2091          ["10", "784"]];
2092 
2093     string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv");
2094     writeUnittestTsvFile(fpath_data2x10b, data2x10b);
2095 
2096     string[][] data2x10bExpectedPermuteWt2Probs =
2097         [["random_value", "line", "weight"],
2098          ["0.99996486739067969", "8", "841"],
2099          ["0.99991017467137211", "4", "448"],
2100          ["0.99960871524873662", "6", "711"],
2101          ["0.999141885371438", "5", "750"],
2102          ["0.99903963250274785", "10", "784"],
2103          ["0.99889631825931946", "7", "867"],
2104          ["0.99852058315191139", "9", "963"],
2105          ["0.99575669679158918", "2", "432"],
2106          ["0.99408758732050595", "1", "761"],
2107          ["0.99315467761212362", "3", "103"]];
2108 
2109     /* 2x10c - Logarithmic distribution in random order. */
2110     string[][] data2x10c =
2111         [["line", "weight"],
2112          ["1", "31.85"],
2113          ["2", "17403.31"],
2114          ["3", "653.84"],
2115          ["4", "8.23"],
2116          ["5", "2671.04"],
2117          ["6", "26226.08"],
2118          ["7", "1.79"],
2119          ["8", "354.56"],
2120          ["9", "35213.81"],
2121          ["10", "679.29"]];
2122 
2123     string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv");
2124     writeUnittestTsvFile(fpath_data2x10c, data2x10c);
2125 
2126     string[][] data2x10cExpectedPermuteWt2Probs =
2127         [["random_value", "line", "weight"],
2128          ["0.99998939008709697", "6", "26226.08"],
2129          ["0.99995951291695517", "9", "35213.81"],
2130          ["0.99991666907613541", "8", "354.56"],
2131          ["0.9998944505218641", "2", "17403.31"],
2132          ["0.9997589760286163", "5", "2671.04"],
2133          ["0.99891852769877643", "3", "653.84"],
2134          ["0.99889167752782515", "10", "679.29"],
2135          ["0.99512207506850148", "4", "8.23"],
2136          ["0.86789371584259023", "1", "31.85"],
2137          ["0.5857443816291561", "7", "1.79"]];
2138 
2139     /* 2x10d - Logarithmic distribution in ascending order. */
2140     string[][] data2x10d =
2141         [["line", "weight"],
2142          ["1", "1.79"],
2143          ["2", "8.23"],
2144          ["3", "31.85"],
2145          ["4", "354.56"],
2146          ["5", "653.84"],
2147          ["6", "679.29"],
2148          ["7", "2671.04"],
2149          ["8", "17403.31"],
2150          ["9", "26226.08"],
2151          ["10", "35213.81"]];
2152 
2153     string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv");
2154     writeUnittestTsvFile(fpath_data2x10d, data2x10d);
2155 
2156     string[][] data2x10dExpectedPermuteWt2Probs =
2157         [["random_value", "line", "weight"],
2158          ["0.99999830221846353", "8", "17403.31"],
2159          ["0.99997860834041397", "10", "35213.81"],
2160          ["0.99994563828986716", "9", "26226.08"],
2161          ["0.99988650363575737", "4", "354.56"],
2162          ["0.99964161939190088", "7", "2671.04"],
2163          ["0.99959045338948649", "6", "679.29"],
2164          ["0.99901574490639788", "5", "653.84"],
2165          ["0.97803163304747431", "3", "31.85"],
2166          ["0.79994791806910948", "2", "8.23"],
2167          ["0.080374261239949119", "1", "1.79"]];
2168 
2169     /* 2x10e - Logarithmic distribution in descending order. */
2170     string[][] data2x10e =
2171         [["line", "weight"],
2172          ["1", "35213.81"],
2173          ["2", "26226.08"],
2174          ["3", "17403.31"],
2175          ["4", "2671.04"],
2176          ["5", "679.29"],
2177          ["6", "653.84"],
2178          ["7", "354.56"],
2179          ["8", "31.85"],
2180          ["9", "8.23"],
2181          ["10", "1.79"]];
2182     string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
2183     writeUnittestTsvFile(fpath_data2x10e, data2x10e);
2184 
2185     string[][] data2x10eExpectedPermuteWt2Probs =
2186         [["random_value", "line", "weight"],
2187          ["0.99998493348975237", "4", "2671.04"],
2188          ["0.99995934807202624", "3", "17403.31"],
2189          ["0.99992995739727453", "2", "26226.08"],
2190          ["0.99987185679245649", "1", "35213.81"],
2191          ["0.99957451563173938", "6", "653.84"],
2192          ["0.99907273650209583", "8", "31.85"],
2193          ["0.99905260312968946", "5", "679.29"],
2194          ["0.99730333650516401", "7", "354.56"],
2195          ["0.84093902435227808", "9", "8.23"],
2196          ["0.65650015926290028", "10", "1.79"]];
2197 
2198     /* Data sets for distinct sampling. */
2199     string[][] data5x25 =
2200         [["ID", "Shape", "Color", "Size", "Weight"],
2201          ["01", "circle", "red", "S", "10"],
2202          ["02", "circle", "black", "L", "20"],
2203          ["03", "square", "black", "L", "20"],
2204          ["04", "circle", "green", "L", "30"],
2205          ["05", "ellipse", "red", "S", "20"],
2206          ["06", "triangle", "red", "S", "10"],
2207          ["07", "triangle", "red", "L", "20"],
2208          ["08", "square", "black", "S", "10"],
2209          ["09", "circle", "black", "S", "20"],
2210          ["10", "square", "green", "L", "20"],
2211          ["11", "triangle", "red", "L", "20"],
2212          ["12", "circle", "green", "L", "30"],
2213          ["13", "ellipse", "red", "S", "20"],
2214          ["14", "circle", "green", "L", "30"],
2215          ["15", "ellipse", "red", "L", "30"],
2216          ["16", "square", "red", "S", "10"],
2217          ["17", "circle", "black", "L", "20"],
2218          ["18", "square", "red", "S", "20"],
2219          ["19", "square", "black", "L", "20"],
2220          ["20", "circle", "red", "S", "10"],
2221          ["21", "ellipse", "black", "L", "30"],
2222          ["22", "triangle", "red", "L", "30"],
2223          ["23", "circle", "green", "S", "20"],
2224          ["24", "square", "green", "L", "20"],
2225          ["25", "circle", "red", "S", "10"],
2226         ];
2227 
2228     string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
2229     string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
2230     writeUnittestTsvFile(fpath_data5x25, data5x25);
2231     writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]);
2232 
2233     string[][] data5x25ExpectedDistinctK2P40 =
2234         [["ID", "Shape", "Color", "Size", "Weight"],
2235          ["03", "square", "black", "L", "20"],
2236          ["05", "ellipse", "red", "S", "20"],
2237          ["08", "square", "black", "S", "10"],
2238          ["10", "square", "green", "L", "20"],
2239          ["13", "ellipse", "red", "S", "20"],
2240          ["15", "ellipse", "red", "L", "30"],
2241          ["16", "square", "red", "S", "10"],
2242          ["18", "square", "red", "S", "20"],
2243          ["19", "square", "black", "L", "20"],
2244          ["21", "ellipse", "black", "L", "30"],
2245          ["24", "square", "green", "L", "20"],
2246         ];
2247 
2248     string[][] data5x25ExpectedDistinctK2K4P20 =
2249         [["ID", "Shape", "Color", "Size", "Weight"],
2250          ["03", "square", "black", "L", "20"],
2251          ["07", "triangle", "red", "L", "20"],
2252          ["08", "square", "black", "S", "10"],
2253          ["10", "square", "green", "L", "20"],
2254          ["11", "triangle", "red", "L", "20"],
2255          ["16", "square", "red", "S", "10"],
2256          ["18", "square", "red", "S", "20"],
2257          ["19", "square", "black", "L", "20"],
2258          ["22", "triangle", "red", "L", "30"],
2259          ["24", "square", "green", "L", "20"],
2260         ];
2261 
2262     string[][] data5x25ExpectedDistinctK2K3K4P20 =
2263         [["ID", "Shape", "Color", "Size", "Weight"],
2264          ["04", "circle", "green", "L", "30"],
2265          ["07", "triangle", "red", "L", "20"],
2266          ["09", "circle", "black", "S", "20"],
2267          ["11", "triangle", "red", "L", "20"],
2268          ["12", "circle", "green", "L", "30"],
2269          ["14", "circle", "green", "L", "30"],
2270          ["16", "square", "red", "S", "10"],
2271          ["18", "square", "red", "S", "20"],
2272          ["22", "triangle", "red", "L", "30"],
2273         ];
2274 
2275     /*
2276      * Enough setup! Actually run some tests!
2277      */
2278 
2279     /* Permutations. Headers, static seed, compatibility mode. With weights and without. */
2280     testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty);
2281     testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0);
2282     testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1);
2283     testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat);
2284     testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat);
2285     testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat);
2286     testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
2287     testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
2288     testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
2289     testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2290     testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2291     testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
2292     testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
2293 
2294     /* Permutations, without compatibility mode, or with both compatibility and printing. */
2295     testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
2296     testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0);
2297     testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1);
2298     testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle);
2299     testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap);
2300     testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap);
2301     testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
2302     testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
2303     testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2304 
2305     /* Reservoir sampling using Algorithm R.
2306      * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.)
2307      */
2308     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
2309     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
2310     testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0);
2311     testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0);
2312     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1);
2313     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1);
2314     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6);
2315     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6);
2316     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum5);
2317     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum4);
2318     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum3);
2319     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum2);
2320     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum1);
2321 
2322     /* Bernoulli sampling cases. */
2323     testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty);
2324     testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0);
2325     testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1);
2326     testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6);
2327     testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6);
2328     testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2329     testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60);
2330     testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60);
2331     testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs);
2332 
2333     /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. */
2334     testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01);
2335     testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02);
2336     testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03);
2337     testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01);
2338     testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02);
2339     testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03);
2340     testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40);
2341 
2342     /* Distinct sampling cases. */
2343     testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
2344     testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
2345     testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1);
2346     testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6);
2347     testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
2348 
2349     /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling.
2350      * For weighted sampling, use the weighted cases, but with the expected output listed in the
2351      * original input order.
2351      */
2352     testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2353     testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2354     testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
2355                   data3x6ExpectedWt3ProbsInorder);
2356     testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
2357                   data3x6ExpectedWt3V41ProbsInorder);
2358     testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6],
2359                   data3x6ExpectedDistinctK1K3P60Probs);
2360     testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header",
2361                    "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom);
2362     testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
2363                   data3x6ExpectedDistinctK2P2ProbsInorder);
2364 
2365     /* Simple random sampling with replacement. */
2366     testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty);
2367     testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty);
2368     testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0);
2369     testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0);
2370     testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3);
2371     testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10);
2372     testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77);
2373 
2374     /* Permutations, compatibility mode, without headers. */
2375     testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1..$]);
2376     testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1..$]);
2377     testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1..$]);
2378     testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..$]);
2379     testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..$]);
2380     testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]);
2381     testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]);
2382     testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]);
2383     testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1..$]);
2384 
2385     /* Permutations, no headers, without compatibility mode, or with printing and compatibility mode. */
2386     testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1..$]);
2387     testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1..$]);
2388     testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1..$]);
2389     testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1..$]);
2390     testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]);
2391     testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]);
2392     testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]);
2393 
2394     /* Reservoir sampling using Algorithm R, no headers. */
2395     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
2396     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
2397     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1..$]);
2398     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1..$]);
2399     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]);
2400     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]);
2401     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum5[1..$]);
2402     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum4[1..$]);
2403     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum3[1..$]);
2404     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum2[1..$]);
2405     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum1[1..$]);
2406 
2407     /* Bernoulli sampling cases, no headers. */
2408     testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1..$]);
2409     testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
2410     testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
2411     testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]);
2412     testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..$]);
2413     testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1..$]);
2414 
2415     /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling, no headers. */
2416     testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1..$]);
2417     testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1..$]);
2418     testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..$]);
2419     testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1..$]);
2420     testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1..$]);
2421     testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1..$]);
2422     testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1..$]);
2423 
2424     /* Distinct sampling cases, no headers. */
2425     testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]);
2426     testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2427     testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2428     testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2429 
2430     /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */
2431     testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]);
2432     testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1..$]);
2433     testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
2434                   data3x6ExpectedDistinctK1K3P60Probs[1..$]);
2435     testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
2436                   data3x6ExpectedDistinctK2P2ProbsInorder[1..$]);
2437 
2438     /* Simple random sampling with replacement. */
2439     testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
2440     testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
2441     testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1..$]);
2442     testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1..$]);
2443     testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1..$]);
2444 
2445     /* Multi-file tests. */
2446     testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode",
2447                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2448                   combo1ExpectedPermuteCompat);
2449     testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
2450                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2451                   combo1ExpectedPermuteCompatProbs);
2452     testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
2453                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2454                   combo1ExpectedPermuteWt3Probs);
2455     testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode",
2456                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2457                   combo1ExpectedPermuteWt3);
2458     testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4",
2459                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2460                   combo1ExpectedPermuteAlgoRNum4);
2461 
2462     /* Multi-file, no headers. */
2463     testTsvSample(["test-c6", "--static-seed", "--compatibility-mode",
2464                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2465                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2466                   combo1ExpectedPermuteCompat[1..$]);
2467     testTsvSample(["test-c7", "--static-seed", "--print-random",
2468                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2469                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2470                   combo1ExpectedPermuteCompatProbs[1..$]);
2471     testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3",
2472                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2473                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2474                   combo1ExpectedPermuteWt3Probs[1..$]);
2475     testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode",
2476                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2477                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2478                   combo1ExpectedPermuteWt3[1..$]);
2479     testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4",
2480                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2481                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2482                   combo1ExpectedPermuteAlgoRNum4[1..$]);
2483 
2484     /* Multi-file Bernoulli sampling cases, with and without headers. */
2485     testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5",
2486                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2487                   combo1ExpectedBernoulliCompatP50Probs);
2488     testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4",
2489                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2490                   combo1ExpectedBernoulliCompatP40);
2491     testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5",
2492                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2493                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2494                   combo1ExpectedBernoulliCompatP50Probs[1..$]);
2495     testTsvSample(["test-c14", "--static-seed", "--prob", ".4",
2496                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2497                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2498                   combo1ExpectedBernoulliCompatP40[1..$]);
2499 
2500     /* Bernoulli sampling with probabilities in skip sampling range. */
2501     testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03",
2502                    fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10],
2503                   combo2ExpectedBernoulliSkipV333P03);
2504     testTsvSample(["test-cc1", "-v", "333", "-p", "0.03",
2505                    fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader],
2506                   combo2ExpectedBernoulliSkipV333P03[1..$]);
2507 
2508     /* Distinct sampling cases. */
2509     testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
2510                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2511                   combo1ExpectedDistinctK1P40);
2512     testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4",
2513                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2514                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2515                   combo1ExpectedDistinctK1P40[1..$]);
2516 
2517     /* Generating random weights. */
2518     testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder",
2519                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2520                   combo1ExpectedProbsInorder);
2521     testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder",
2522                    fpath_data3x3_noheader, fpath_data3x1_noheader,
2523                    fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
2524                   combo1ExpectedProbsInorder[1..$]);
2525 
2526     /* Simple random sampling with replacement. */
2527     testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10",
2528                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2529                   combo1ExpectedReplaceNum10);
2530 
2531     testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10",
2532                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2533                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2534                   combo1ExpectedReplaceNum10[1..$]);
2535 
2536     /* Single column file. */
2537     testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
2538     testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
2539 
2540     /* Distributions. */
2541     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
2542     testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs);
2543     testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs);
2544     testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs);
2545     testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs);
2546 
2547     /* Tests of the subset sample size (--n|num) option.
2548      *
2549      * Note: Each sample size is checked against a prefix of the same expected output,
2550      * which ensures that subset length does not affect output order.
2551      */
2552     import std.algorithm : min;
2553     for (size_t n = data3x6.length + 2; n >= 1; n--)
2554     {
2555         /* reservoirSamplingViaHeap.
2556          */
2557         size_t expectedLength = min(data3x6.length, n + 1);
2558         testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
2559                        "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
2560 
2561         testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
2562                        "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
2563 
2564         testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
2565                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]);
2566 
2567         testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
2568                        "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]);
2569 
2570         testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
2571                        "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]);
2572 
2573         testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
2574                        fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]);
2575 
2576         testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
2577                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]);
2578 
2579         testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
2580                        "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]);
2581 
2582         testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string,
2583                        "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]);
2584 
2585         /* Bernoulli sampling.
2586          */
2587         import std.algorithm : min;
2588         size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length);
2589 
2590         testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2591                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]);
2592 
2593         testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2594                        "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]);
2595 
2596         testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2597                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]);
2598 
2599         testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2600                        fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]);
2601 
2602         /* Distinct Sampling.
2603          */
2604         size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length);
2605 
2606         testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
2607                        "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]);
2608 
2609         testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
2610                        fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]);
2611 
2612         testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
2613                        "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]);
2614 
2615         testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
2616                        fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
2617     }
2618 
2619     /* Similar tests with the 1x10 data set. */
2620     for (size_t n = data1x10.length + 2; n >= 1; n--)
2621     {
2622         size_t expectedLength = min(data1x10.length, n + 1);
2623         testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
2624                        "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);
2625 
2626         testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
2627                        "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);
2628 
2629         testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
2630                        fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);
2631 
2632         testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
2633                        "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
2634     }
2635 
2636     /* Simple random sampling with replacement: ensure sample size doesn't change order. */
2637     for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
2638     {
2639         testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
2640                       data3x6ExpectedReplaceNum10[0 .. n + 1]);
2641 
2642         testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
2643                       data3x6ExpectedReplaceNum10[1 .. n + 1]);
2644     }
2645 
2646     /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */
2647     for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
2648     {
2649         size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);
2650 
2651         testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
2652                        "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);
2653 
2654         testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
2655                        fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
2656 }
2657 
2658 
2659     /* Distinct sampling tests. */
2660     testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
2661                   data5x25ExpectedDistinctK2P40);
2662 
2663     testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
2664                   data5x25ExpectedDistinctK2K4P20);
2665 
2666     testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
2667                   data5x25ExpectedDistinctK2K3K4P20);
2668 
2669     testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
2670                   data5x25ExpectedDistinctK2P40[1..$]);
2671 
2672     testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
2673                   data5x25ExpectedDistinctK2K4P20[1..$]);
2674 
2675     testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
2676                   data5x25ExpectedDistinctK2K3K4P20[1..$]);
2677 }