tsv_utils.tsv_sample source code

1 /**
2 Command line tool for randomizing or sampling lines from input streams. Several
3 sampling methods are available, including simple random sampling, weighted random
4 sampling, Bernoulli sampling, and distinct sampling.
5 
6 Copyright (c) 2017-2018, eBay Software Foundation
7 Initially written by Jon Degenhardt
8 
9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 module tsv_utils.tsv_sample;
12 
13 import std.range;
14 import std.stdio;
15 import std.typecons : tuple, Flag;
16 
version(unittest)
{
    // Unit test builds get their entry point from the compiler's -main switch.
}
else
{
    /** Program entry point.
     *
     * Parses the command line into a TsvSampleOptions instance and, if processing
     * should continue, runs tsvSample writing to a buffered wrapper around stdout.
     *
     * Returns: The exit code from argument processing when it indicates the program
     * should stop; otherwise 0 on success or 1 if sampling threw an Exception.
     */
    int main(string[] cmdArgs)
    {
        /* Merge coverage reports across runs when built by DMD with code coverage on. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        const argStatus = cmdopt.processArgs(cmdArgs);
        immutable bool keepGoing = argStatus[0];
        immutable int argExitCode = argStatus[1];
        if (!keepGoing) return argExitCode;

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        int exitCode = 0;
        try
        {
            import tsv_utils.common.utils : BufferedOutputRange;

            /* The buffered range is scoped to the try block, as in the original layout. */
            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            exitCode = 1;
        }
        return exitCode;
    }
}
55 
/** Brief help text. Printed ahead of the option list by processArgs when the
 * standard help option is given.
 */
auto helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

The '--n|num' option limits the sample size produced. It speeds up line
order randomization and weighted sampling significantly. It is also used
to terminate sampling with replacement.

Use '--help-verbose' for detailed information.

Options:
EOS";
86 
/** Detailed help text. Printed ahead of the option list by processArgs when
 * '--help-verbose' is given.
 *
 * Option names mentioned here must match the names registered with getopt in
 * TsvSampleOptions.processArgs. In particular, '--print-random' and
 * '--gen-random-inorder' have no single letter forms.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up line order randomization and weighted sampling significantly
(details below). It is also used to terminate sampling with replacement.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, so there is no memory accumulation. These
algorithms support arbitrary size inputs. Sampling with replacement reads
all lines into memory and is limited by available memory. The line order
randomization algorithms hold the full output set in memory prior to
generating results. This ultimately limits the size of the output set. For
these, memory needs can be reduced by using a sample size (--n|num). This
engages reservoir sampling. Output order is not affected. Both
'tsv-sample -n 1000' and 'tsv-sample | head -n 1000' produce the same
results, but the former is quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";
180 
/** Container for command line options.
 *
 * Fields marked with an option name are filled in directly by getopt. Fields marked
 * "Derived" are computed by processArgs from the other options.
 */
struct TsvSampleOptions
{
    string programName;                        /// Program name
    string[] files;                            /// Input files
    bool helpVerbose = false;                  /// --help-verbose
    bool hasHeader = false;                    /// --H|header
    size_t sampleSize = 0;                     /// --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
    size_t[] keyFields;                        /// --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    /// --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           /// --r|replace
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    bool printRandom = false;                  /// --print-random
    bool genRandomInorder = false;             /// --gen-random-inorder
    string randomValueHeader = "random_value"; /// --random-value-header
    bool compatibilityMode = false;            /// --compatibility-mode
    char delim = '\t';                         /// --d|delimiter
    bool versionWanted = false;                /// --V|version
    bool preferSkipSampling = false;           /// --prefer-skip-sampling
    bool preferAlgorithmR = false;             /// --prefer-algorithm-r
    bool hasWeightField = false;               /// Derived.
    bool useBernoulliSampling = false;         /// Derived.
    bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
    bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
    uint seed = 0;                             /// Derived from --static-seed, --seed-value

    /** Parses and validates the command line, filling in the option and derived fields.
     *
     * Help and version requests are handled here by printing the requested text.
     * Validation failures and getopt errors are reported on stderr; exceptions are
     * not propagated to the caller.
     *
     * Params:
     *   cmdArgs = The command line arguments, program name first. On success the
     *             array is truncated to the program name; the remaining arguments
     *             are moved to `files` ("-" is used if no files were given).
     *
     * Returns: A tuple (bool, int). The first member is true if the program should
     * continue processing. When it is false the second member is the exit code to
     * use: 0 after printing help or version text, 1 after an error.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : any, canFind, each;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected for output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                "k|key-fields",    "<field-list>  Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "w|weight-field",  "NUM  Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
                "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header",  "     Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
                "compatibility-mode", "     Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,

                "d|delimiter",     "CHR  Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "prefer-skip-sampling", "     (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
                &preferSkipSampling,

                "prefer-algorithm-r",   "     (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
                &preferAlgorithmR,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            if (srsWithReplacement)
            {
                if (hasWeightField)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");
                }
                else if (!inclusionProbability.isNaN)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");
                }
                else if (keyFields.length > 0)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");
                }
                else if (printRandom || genRandomInorder)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");
                }
            }

            if (keyFields.length > 0)
            {
                /* Note: useDistinctSampling is set as part of the inclusion probability checks below. */

                if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields.");

                if (keyFields.length == 1 && keyFields[0] == 0)
                {
                    distinctKeyIsFullLine = true;
                }
                else
                {
                    if (keyFields.length > 1 && keyFields.any!(x => x == 0))
                    {
                        throw new Exception("Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");
                    }

                    keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
                }
            }

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */
            if (!inclusionProbability.isNaN)
            {
                if (inclusionProbability <= 0.0 || inclusionProbability > 1.0)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));
                }

                if (keyFields.length > 0) useDistinctSampling = true;
                else useBernoulliSampling = true;

                if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together.");

                if (genRandomInorder && !useDistinctSampling)
                {
                    /* --gen-random-inorder has no single letter form, so only the long name is reported. */
                    throw new Exception("--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used.");
                }
            }
            else if (genRandomInorder && !hasWeightField)
            {
                /* Unweighted gen-random-inorder is handled by the Bernoulli sampling routine. */
                useBernoulliSampling = true;
            }

            if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') ||
                randomValueHeader.canFind(delim))
            {
                throw new Exception("--random-value-header must be at least one character and not contain field delimiters or newlines.");
            }

            /* Random value printing implies compatibility-mode, otherwise user's selection is used. */
            if (printRandom || genRandomInorder) compatibilityMode = true;

            /* Seed. */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            /* Assume remaining args are files. Use standard input if files were not provided. */
            files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"];
            cmdArgs.length = 1;
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
/** Dispatches to the sampling routine selected by the command line options.
 *
 * The options are checked in a fixed priority order: sampling with replacement,
 * Bernoulli sampling, distinct sampling, weighted gen-random-inorder, reservoir
 * sampling (when a sample size is given), and finally full line order randomization.
 *
 * Params:
 *   cmdopt       = Processed command line options.
 *   outputStream = Output range receiving the selected lines.
 */
void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.srsWithReplacement)
    {
        simpleRandomSamplingWithReplacement(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useBernoulliSampling)
    {
        bernoulliSamplingCommand(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useDistinctSampling)
    {
        if (cmdopt.genRandomInorder)
        {
            distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
        return;
    }

    if (cmdopt.genRandomInorder)
    {
        /* The Bernoulli and distinct cases above deal with gen-random-inorder
         * themselves, and sampling with replacement does not support it. Only the
         * weighted case is left at this point.
         */
        assert(cmdopt.hasWeightField);
        generateWeightedRandomValuesInorder(cmdopt, outputStream);
        return;
    }

    if (cmdopt.sampleSize == 0)
    {
        randomizeLinesCommand(cmdopt, outputStream);
    }
    else
    {
        reservoirSamplingCommand(cmdopt, outputStream);
    }
}
421 
/** Invokes the appropriate Bernoulli sampling routine based on the command line arguments.
 *
 * This routine selects the appropriate Bernoulli sampling function and template
 * instantiation to use based on the command line arguments.
 *
 * The standard per-line routine (bernoulliSampling) is used when any of these hold:
 * $(LIST
 *     * Compatibility mode is on.
 *     * The inclusion probability is above skipSamplingProbabilityThreshold and
 *       skip sampling was not explicitly preferred.
 *     * The inclusion probability is 1.0 or more. bernoulliSkipSampling asserts that
 *       the probability is strictly less than 1.0 and takes the log of
 *       (1.0 - probability), so it must not be used in this case, even if
 *       '--prefer-skip-sampling' was given.
 * )
 * Otherwise bernoulliSkipSampling is used.
 *
 * See the bernoulliSkipSampling routine for a discussion of the choices behind the
 * skipSamplingProbabilityThreshold used here.
 *
 * Params:
 *   cmdopt       = Processed command line options. Must not have a weight field.
 *   outputStream = Output range receiving the selected lines.
 */
void bernoulliSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(!cmdopt.hasWeightField);

    immutable double skipSamplingProbabilityThreshold = 0.04;

    immutable bool standardRoutinePreferred =
        cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling;

    /* A probability of 1.0 selects every line. Skip sampling cannot represent this. */
    immutable bool skipSamplingNotApplicable = cmdopt.inclusionProbability >= 1.0;

    if (cmdopt.compatibilityMode || standardRoutinePreferred || skipSamplingNotApplicable)
    {
        if (cmdopt.genRandomInorder)
        {
            bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
    }
    else
    {
        bernoulliSkipSampling(cmdopt, outputStream);
    }
}
454 
/** Bernoulli sampling of lines on the input stream.
 *
 * A uniform random value is drawn for every data line. In normal mode the line is
 * written only if its value is less than cmdopt.inclusionProbability. In
 * generateRandomAll mode every line is written, prefixed by its random value. Line
 * order is never changed. Output stops early once cmdopt.sampleSize lines have been
 * written, if a non-zero sample size was given.
 *
 * When cmdopt.hasHeader is set, the first line of each file is treated as a header.
 * Only the first header is written. It is prefixed with cmdopt.randomValueHeader when
 * random values are being printed.
 *
 * Params:
 *   generateRandomAll = Yes to write all lines with random values (gen-random-inorder).
 *   cmdopt            = Processed command line options.
 *   outputStream      = Output range receiving the selected lines.
 */
void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    /* Random values are always written in generateRandomAll mode, otherwise only
     * when print-random was requested.
     */
    static if (generateRandomAll) enum bool writeRandomValue = true;
    else immutable bool writeRandomValue = cmdopt.printRandom;

    immutable bool hasOutputLimit = (cmdopt.sampleSize != 0);

    auto rng = Random(cmdopt.seed);
    bool headerPending = true;
    size_t linesEmitted = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Header line. Written once, for the first file only. */
                    if (headerPending)
                    {
                        if (writeRandomValue)
                        {
                            outputStream.put(cmdopt.randomValueHeader);
                            outputStream.put(cmdopt.delim);
                        }
                        outputStream.put(line);
                        outputStream.put("\n");
                        headerPending = false;
                    }
                    continue;
                }
            }

            /* Data line. A random value is drawn for every data line, selected or not. */
            double lineScore = uniform01(rng);

            static if (!generateRandomAll)
            {
                if (!(lineScore < cmdopt.inclusionProbability)) continue;
            }

            if (writeRandomValue)
            {
                outputStream.formatRandomValue(lineScore);
                outputStream.put(cmdopt.delim);
            }
            outputStream.put(line);
            outputStream.put("\n");

            if (hasOutputLimit)
            {
                ++linesEmitted;
                if (linesEmitted == cmdopt.sampleSize) return;
            }
        }
    }
}
540 
/** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips.
 *
 * Skip sampling works by skipping a random number of lines between selections. This
 * can be faster than assigning a random value to each line when the inclusion
 * probability is low, as it reduces the number of calls to the random number
 * generator. Both the random number generator and the log() function are called when
 * calculating the next skip size. These additional log() calls add up as the
 * probability increases.
 *
 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
 * file-oriented line sampling. This is obviously environment specific. In the
 * environments this implementation has been tested in the performance improvements
 * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
 *
 * The algorithm does not assign random values to individual lines. This makes it
 * incompatible with random value printing. It is not suitable for compatibility mode
 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
 * does not have this property.
 *
 * The algorithm for calculating the skip size has been described by multiple sources.
 * There are two key variants depending on whether the total number of lines in the
 * data set is known in advance. (This implementation does not know the total.)
 * Useful references:
 * $(LIST
 *     * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
 *       ACM Trans on Mathematical Software, 1987. On-line:
 *       http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
 *     * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
 *       "Data Stream Management", Springer-Verlag, 2016. On-line:
 *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
 *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
 *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
 * )
 *
 * Params:
 *   cmdopt       = Processed command line options. The inclusion probability must be
 *                  strictly between 0.0 and 1.0, and neither print-random nor
 *                  compatibility mode may be set.
 *   outputStream = Output range receiving the selected lines. Taken by 'auto ref',
 *                  matching the other sampling routines, so lvalue ranges are written
 *                  to directly rather than through a copy.
 */
void bernoulliSkipSampling(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.conv : to;
    import std.math : log, trunc;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.compatibilityMode);

    auto randomGenerator = Random(cmdopt.seed);

    immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
    immutable double logDiscardRate = log(discardRate);

    /* Draws the number of lines to skip before the next selected line.
     *
     * Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed
     * interval to (0.0, 1.0], excluding 0.0.
     */
    size_t nextSkipCount()
    {
        return (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
    }

    size_t remainingSkips = nextSkipCount();

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Header line. Written once, for the first file only. */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else if (remainingSkips > 0)
            {
                --remainingSkips;
            }
            else
            {
                outputStream.put(line);
                outputStream.put("\n");

                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }

                remainingSkips = nextSkipCount();
            }
        }
    }
}
636 
/** Distinct sampling: selects lines by hashing their key fields.
 *
 * The key of each data line is either the full line (cmdopt.distinctKeyIsFullLine)
 * or the fields listed in cmdopt.keyFields, joined by the field delimiter. The key
 * is hashed with a 32-bit MurmurHash3 seeded from cmdopt.seed, and the hash is
 * reduced modulo a bucket count equal to the rounded reciprocal of
 * cmdopt.inclusionProbability. The bucket depends only on the key and the seed, so
 * every line sharing a key lands in the same bucket.
 *
 * Output depends on the generateRandomAll flag:
 * $(LIST
 *    * No.generateRandomAll - Only lines in bucket zero are written. If
 *      cmdopt.printRandom is set, each is prefixed with a literal '0' column.
 *    * Yes.generateRandomAll - Every line is written, prefixed with its bucket
 *      number.
 * )
 *
 * When cmdopt.hasHeader is set, the first line of the first file is written as the
 * header (with the random value header prepended when a random value column is being
 * written) and the first line of every later file is dropped. A non-zero
 * cmdopt.sampleSize ends processing once that many data lines have been written.
 *
 * Throws: Exception when a line does not contain all the requested key fields.
 */
void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsv_utils.common.utils : InputFieldReordering, throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);

    static if (generateRandomAll)
    {
        import std.format : formatValue, singleSpec;
        immutable randomValueFormatSpec = singleSpec("%d");
    }

    /* Delimiter as a byte array, fed to the hasher between key fields. */
    immutable ubyte[1] delimArray = [cmdopt.delim];

    uint bucketCount = (1.0 / cmdopt.inclusionProbability).lrint.to!uint;

    /* Field extractor for the key fields. Not needed when the whole line is the key. */
    auto keyExtractor = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);

    bool headerDone = false;
    size_t linesEmitted = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            /* Header lines: written once, from the first file only. */
            if (isFirstLine && cmdopt.hasHeader)
            {
                if (!headerDone)
                {
                    if (generateRandomAll || cmdopt.printRandom)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                    }

                    outputStream.put(line);
                    outputStream.put("\n");
                    headerDone = true;
                }
                continue;
            }

            /* Murmurhash accumulates key pieces one at a time and is then finalized.
             * The full-line key and the field-based key are fed in separately.
             */
            auto hasher = MurmurHash3!32(cmdopt.seed);

            if (cmdopt.distinctKeyIsFullLine)
            {
                hasher.put(cast(ubyte[]) line);
            }
            else
            {
                assert(keyExtractor !is null);

                /* Collect the key fields, stopping as soon as all have been seen. */
                keyExtractor.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    keyExtractor.processNextField(fieldIndex, fieldValue);
                    if (keyExtractor.allFieldsFilled) break;
                }

                if (!keyExtractor.allFieldsFilled)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               (filename == "-") ? "Standard Input" : filename, fileLineNum));
                }

                /* Hash the key fields with the delimiter between consecutive fields. */
                foreach (keyIndex, keyValue; keyExtractor.outputFields.enumerate)
                {
                    if (keyIndex > 0) hasher.put(delimArray);
                    hasher.put(cast(ubyte[]) keyValue);
                }
            }

            hasher.finish();
            uint bucket = hasher.get % bucketCount;

            static if (generateRandomAll)
            {
                /* Every line is written, prefixed by its bucket number. */
                outputStream.formatValue(bucket, randomValueFormatSpec);
                outputStream.put(cmdopt.delim);
            }
            else
            {
                /* Only bucket zero is part of the sample. */
                if (bucket != 0) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put('0');
                    outputStream.put(cmdopt.delim);
                }
            }

            outputStream.put(line);
            outputStream.put("\n");

            if (cmdopt.sampleSize != 0)
            {
                ++linesEmitted;
                if (linesEmitted == cmdopt.sampleSize) return;
            }
        }
    }
}
776 
/** Dispatches to the reservoir sampling routine matching the command line arguments.
 *
 * Reservoir sampling is used when a fixed size sample is pulled from an input
 * stream. Both weighted and unweighted sampling are supported. The routines invoked
 * here also randomize the order of the selected lines, consistent with line order
 * randomization of the entire input stream (handled by randomizeLinesCommand).
 *
 * Selection rules:
 * $(LIST
 *    * A weight field always selects the weighted heap-based routine.
 *    * Unweighted sampling uses the heap-based routine in compatibility mode, or
 *      when the sample size is below the threshold and Algorithm R was not
 *      explicitly preferred.
 *    * Otherwise Algorithm R is used.
 * )
 *
 * For unweighted sampling there is a performance tradeoff between the two
 * implementations; see the reservoirSamplingAlgorithmR documentation. The threshold
 * used here was chosen based on performance tests.
 */
void reservoirSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(cmdopt.sampleSize != 0);

    /* Weighted sampling is only implemented by the heap-based routine. */
    if (cmdopt.hasWeightField)
    {
        reservoirSamplingViaHeap!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    immutable size_t algorithmRSampleSizeThreshold = 128 * 1024;

    immutable bool isSmallSample = cmdopt.sampleSize < algorithmRSampleSizeThreshold;
    immutable bool useHeap =
        cmdopt.compatibilityMode || (isSmallSample && !cmdopt.preferAlgorithmR);

    if (useHeap) reservoirSamplingViaHeap!(No.isWeighted)(cmdopt, outputStream);
    else reservoirSamplingAlgorithmR(cmdopt, outputStream);
}
814 
/** Reservoir sampling using a heap. Both weighted and unweighted random sampling are
 * supported.
 *
 * The algorithm is based on the one-pass algorithm described by Pavlos Efraimidis
 * and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case the score of
 * a line is simply a uniform random value; in the weighted case it is a uniform
 * random value raised to the power (1 / weight), with non-positive weights scored
 * as zero.
 *
 * The implementation keeps a heap (priority queue) large enough to hold the desired
 * number of lines. Input is read line-by-line, each line is assigned a random score,
 * and the heap is used to identify the lines with the highest scores. Once the heap
 * is full, a new line is admitted only if its score exceeds the lowest score in the
 * heap, in which case the lowest scoring line is dropped. A "min" heap is used for
 * this reason.
 *
 * When all lines have been read, the "min" heap is in the opposite order needed for
 * output. The desired order is obtained by removing the elements one at a time from
 * the heap, which leaves the underlying data store in the correct order.
 *
 * Generating output in weighted order matters for several reasons:
 *  - For weighted sampling, it preserves the property that smaller valid subsets can
 *    be created by taking the first N lines.
 *  - For unweighted sampling, it ensures that all output permutations are possible
 *    and are not influenced by input order or the heap data structure used.
 *  - Order consistency when making repeated use of the same random seeds, but with
 *    different sample sizes.
 *
 * There are use cases where only the selection set matters. For these some
 * performance could be gained by skipping the reordering and printing the backing
 * store in-order, but making this distinction seems an unnecessary complication.
 *
 * Notes:
 * $(LIST
 *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
 *      randomization of all input lines. This was dropped in version 1.2.2 in favor
 *      of the approach used in randomizeLines. The latter has significant advantages
 *      given that all data must be read into memory.
 *    * For larger reservoir sizes better performance can be achieved by using
 *      reservoirSamplingAlgorithmR. See the documentation of that function for details.
 * )
 */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.container.array;
    import std.container.binaryheap;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    auto rng = Random(cmdopt.seed);

    /* A candidate line paired with the random score assigned to it. */
    struct ScoredLine
    {
        double score;
        char[] text;
    }

    /* Create the heap and its backing data store.
     *
     * Note: An std.container.array is used as the backing store to avoid some issues
     * in the standard library (Phobos) binaryheap implementation. Specifically, when
     * an std.container.array is used as backing store, the heap can be efficiently
     * reversed by removing the heap elements. This leaves the backing store in the
     * reversed order. However, the current binaryheap implementation does not support
     * this for all backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */
    Array!ScoredLine backingStore;
    backingStore.reserve(cmdopt.sampleSize);
    auto minHeap = backingStore.heapify!("a.score > b.score")(0);  // Lowest score at the front.

    /* Read all input, scoring every data line. */
    bool headerDone = false;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            /* Header lines: written once, from the first file only. */
            if (isFirstLine && cmdopt.hasHeader)
            {
                if (!headerDone)
                {
                    if (cmdopt.printRandom)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerDone = true;
                }
                continue;
            }

            static if (isWeighted)
            {
                double weight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);

                /* A random value is drawn only for positive weights. */
                double score = 0.0;
                if (weight > 0.0) score = uniform01(rng) ^^ (1.0 / weight);
            }
            else
            {
                double score = uniform01(rng);
            }

            /* The line buffer is reused by byLine, so retained lines are copied. */
            if (minHeap.length < cmdopt.sampleSize)
            {
                minHeap.insert(ScoredLine(score, line.dup));
            }
            else if (minHeap.front.score < score)
            {
                minHeap.replaceFront(ScoredLine(score, line.dup));
            }
        }
    }

    /* All selected entries are in the heap. The heap is in reverse order of the
     * assigned scores. Removing all elements from the heap leaves the backing store
     * in the correct order for output.
     *
     * The asserts here avoid issues with the current binaryheap implementation. They
     * detect use of backing stores having a length not synchronized to the heap.
     */
    immutable size_t selectedCount = minHeap.length;
    assert(selectedCount == backingStore.length);

    while (!minHeap.empty)
    {
        minHeap.removeFront();
    }
    assert(selectedCount == backingStore.length);

    foreach (selected; backingStore)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatRandomValue(selected.score);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(selected.text);
        outputStream.put("\n");
    }
}
965 
/** Writes every input line, in input order, prefixed by a weighted random value.
 *
 * This complements weighted reservoir sampling. Rather than collecting lines in a
 * reservoir, it iterates over the input and emits each data line together with its
 * weighted random value. The value is computed with the same formula used by the
 * weighted reservoir sampling routine: a uniform random value raised to the power
 * (1 / weight), or zero when the weight is not positive.
 *
 * When cmdopt.hasHeader is set, the header of the first file is written once, with
 * the random value header prepended; the first line of later files is dropped. A
 * non-zero cmdopt.sampleSize ends processing after that many data lines are written.
 */
void generateWeightedRandomValuesInorder(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.hasWeightField);

    auto rng = Random(cmdopt.seed);

    bool headerDone = false;
    size_t linesEmitted = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            /* Header lines: written once, from the first file only. */
            if (isFirstLine && cmdopt.hasHeader)
            {
                if (!headerDone)
                {
                    outputStream.put(cmdopt.randomValueHeader);
                    outputStream.put(cmdopt.delim);
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerDone = true;
                }
                continue;
            }

            double weight = getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                                 filename, fileLineNum);

            /* A random value is drawn only for positive weights. */
            double score = 0.0;
            if (weight > 0.0) score = uniform01(rng) ^^ (1.0 / weight);

            outputStream.formatRandomValue(score);
            outputStream.put(cmdopt.delim);
            outputStream.put(line);
            outputStream.put("\n");

            if (cmdopt.sampleSize != 0)
            {
                ++linesEmitted;
                if (linesEmitted == cmdopt.sampleSize) return;
            }
        }
    }
}
1025 
/** Reservoir sampling via Algorithm R
 *
 * This is an implementation of reservoir sampling using what is commonly known as
 * "Algorithm R", credited to Alan Waterman by Donald Knuth in "The Art of Computer
 * Programming, Volume 2: Seminumerical Algorithms". More information about the
 * algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with a
 * Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
 *
 * Algorithm R is used for unweighted sampling without replacement. The heap-based
 * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
 *
 * The classic algorithm stops after identifying the selected set of items. This
 * implementation goes one step further and randomizes the order of the selected
 * lines. This supports the tsv-sample use-case, which is line order randomization.
 *
 * This algorithm is faster than reservoirSamplingViaHeap when the sample size
 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
 * Insertion in this algorithm is O(1). Similarly, generating the random order in the
 * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
 *
 * This speed advantage may be offset a certain amount by using a more expensive
 * random value generator. reservoirSamplingViaHeap generates values between zero and
 * one, whereas reservoirSamplingAlgorithmR generates random integers over an ever
 * growing interval. The latter is expected to be more expensive. This is consistent
 * with performance tests indicating that reservoirSamplingViaHeap is faster when
 * using small-to-medium size reservoirs and large input streams.
 *
 * Header handling: when cmdopt.hasHeader is set, the first line of the first file is
 * written immediately as the header and the first line of later files is dropped.
 *
 * Params:
 *   cmdopt       = Command line options. Requires a positive sampleSize and none of
 *                  hasWeightField, compatibilityMode, printRandom, genRandomInorder.
 *   outputStream = Output range receiving the header and the sampled lines.
 */
void reservoirSamplingAlgorithmR(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, randomShuffle, uniform;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.sampleSize > 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.compatibilityMode);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    string[] reservoir;
    auto reservoirAppender = appender(&reservoir);
    reservoirAppender.reserve(cmdopt.sampleSize);

    auto randomGenerator = Random(cmdopt.seed);

    /* Process each line. totalLineNum counts the data lines seen before the current
     * one, so it is also the zero-based index of the current data line.
     */
    bool headerWritten = false;
    size_t totalLineNum = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                /* Add lines to the reservoir until the reservoir is filled.
                 * After that lines are added with decreasing likelihood, based on
                 * the total number of lines seen. If added to the reservoir, the
                 * line replaces a randomly chosen existing line.
                 */
                if (totalLineNum < cmdopt.sampleSize)
                {
                    reservoirAppender ~= line.idup;
                }
                else
                {
                    /* Algorithm R draws a slot uniformly over ALL lines seen so far,
                     * including the current one. std.random.uniform excludes its
                     * upper bound, so the bound is totalLineNum + 1. This keeps the
                     * current line with probability sampleSize / (totalLineNum + 1).
                     * Using totalLineNum as the bound would always accept the first
                     * line after the reservoir fills, biasing the sample.
                     */
                    immutable size_t i = uniform(0, totalLineNum + 1, randomGenerator);
                    if (i < reservoir.length) reservoir[i] = line.idup;
                }

                ++totalLineNum;
            }
        }
    }

    /* The random sample is now in the reservoir. Shuffle it and print. */

    reservoir.randomShuffle(randomGenerator);

    foreach (ref line; reservoir)
    {
        outputStream.put(line);
        outputStream.put("\n");
    }
}
1123 
/** Dispatches to the line randomization routine matching the command line arguments.
 *
 * Selection rules:
 * $(LIST
 *    * A weight field selects the weighted sort-based routine.
 *    * Compatibility mode without a weight field selects the unweighted sort-based
 *      routine.
 *    * Otherwise the shuffle-based routine is used.
 * )
 */
void randomizeLinesCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.hasWeightField)
    {
        randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    if (cmdopt.compatibilityMode)
    {
        randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
        return;
    }

    randomizeLinesViaShuffle(cmdopt, outputStream);
}
1146 
/** Randomizes all lines from files or standard input by assigning each line a random
 * value and sorting.
 *
 * All lines in files and/or standard input are read in and written out in random
 * order. Each line is assigned a random value and the lines are written in
 * descending order of that value. This approach supports both weighted sampling and
 * simple random sampling (unweighted). When cmdopt.printRandom is set, each output
 * line is prefixed with its random value.
 *
 * This is significantly faster than heap-based reservoir sampling in the case where
 * the entire file is being read. See also randomizeLinesViaShuffle for the
 * unweighted case, as it is a little faster, at the cost of not supporting random
 * value printing or compatibility-mode.
 *
 * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 */
void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map, sort;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize == 0);

    /* Load every file into memory, then split into lines with a random value per
     * line. identifyFileLines also writes the header line of the first file.
     */
    auto loadedFiles = cmdopt.files.map!FileData.array;
    auto scoredLines = loadedFiles.identifyFileLines!(Yes.hasRandomValue, isWeighted)(cmdopt, outputStream);

    /* Highest random value first. */
    scoredLines.sort!((lhs, rhs) => lhs.randomValue > rhs.randomValue);

    foreach (scored; scoredLines)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatRandomValue(scored.randomValue);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(scored.data);
        outputStream.put("\n");
    }
}
1196 
/** Randomizes all lines from files or standard input by shuffling an in-memory array.
 *
 * All lines in files and/or standard input are read in and written out in random
 * order. This routine uses array shuffling, which is faster than sorting. This makes
 * it a good alternative to randomizeLinesViaSort when doing unweighted
 * randomization.
 *
 * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 *
 * This routine does not support random value printing or compatibility-mode.
 */
void randomizeLinesViaShuffle(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, randomShuffle;

    assert(cmdopt.sampleSize == 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    /* Load every file into memory and split into lines. identifyFileLines is also
     * handed the output stream for writing the header line.
     */
    auto loadedFiles = cmdopt.files.map!FileData.array;
    auto allLines = loadedFiles.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);

    /* Shuffle in place, then print.
     *
     * Note: Also tried randomCover, but that was exceedingly slow.
     */
    auto rng = Random(cmdopt.seed);
    allLines.randomShuffle(rng);

    foreach (ref entry; allLines)
    {
        outputStream.put(entry.data);
        outputStream.put("\n");
    }
}
1243 
/** Simple random sampling with replacement.
 *
 * All lines in files and/or standard input are read in. Random lines are then
 * selected one at a time and written out. A line can be selected multiple times.
 * This continues until the desired number of samples (--n|num) has been written.
 * Output continues indefinitely if a sample size was not provided.
 *
 * Nothing beyond the header is written when the input contains no data lines.
 */
void simpleRandomSamplingWithReplacement(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, uniform;

    /* Load every file into memory and split into lines. */
    auto loadedFiles = cmdopt.files.map!FileData.array;
    auto allLines = loadedFiles.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);

    if (allLines.length == 0) return;

    auto rng = Random(cmdopt.seed);

    /* A sample size of zero means repeat forever: the counter is then never
     * decremented and stays at one. Otherwise it counts down from sampleSize.
     */
    immutable bool isUnbounded = (cmdopt.sampleSize == 0);
    size_t remaining = isUnbounded ? 1 : cmdopt.sampleSize;

    do
    {
        immutable size_t pick = uniform(0, allLines.length, rng);
        outputStream.put(allLines[pick].data);
        outputStream.put("\n");
        if (!isUnbounded) --remaining;
    }
    while (remaining != 0);
}
1278 
/** A container and reader of data from a file or standard input.
 *
 * A FileData is created by passing a filename to the constructor, which reads the
 * complete contents of the file. If the filename is a single hyphen ('-') the data
 * is read from standard input instead.
 *
 * The struct makes the data available through two members: 'filename', which is the
 * filename passed to the constructor, and 'data', which is a character array holding
 * the data that was read.
 */
struct FileData
{
    string filename;
    char[] data;

    /** Reads all data from the named file, or from standard input when fname is "-". */
    this(string fname)
    {
        import std.algorithm : min;
        import std.array : appender;

        filename = fname;

        immutable bool fromStdin = (fname == "-");
        auto source = fromStdin ? stdin : fname.File;
        auto sink = appender(&data);

        /* For regular files, reserve space up front when the size is available. */
        if (!fromStdin)
        {
            ulong byteCount = source.size;
            if (byteCount < ulong.max) sink.reserve(min(byteCount, size_t.max));
        }

        /* Read in fixed size chunks, appending each chunk to 'data'. */
        ubyte[1024 * 128] chunkBuf;
        foreach (ref ubyte[] chunk; source.byChunk(chunkBuf))
        {
            sink.put(cast(char[]) chunk);
        }
    }
}
1313 
/** Compile-time boolean flag used by identifyFileLines and InputLine. It separates
 * the use cases that assign a random value to each line from those that do not.
 */
alias HasRandomValue = Flag!("hasRandomValue");
1318 
/** Element type of the array returned by identifyFileLines. One InputLine represents
 * one non-header line found in a FileData array. The 'data' member holds the line.
 * A 'randomValue' member is present only when random values are being generated.
 */
struct InputLine(HasRandomValue hasRandomValue)
{
    char[] data;

    static if (hasRandomValue)
    {
        double randomValue;
    }
}
1328 
/** identifyFileLines is used by algorithms that read all files into memory prior to
 * processing. It does the initial processing of the file data.
 *
 * Three primary tasks are performed:
 * $(LIST
 *    * Splitting all input data into lines. A trailing newline at the end of a
 *      file's data is dropped first so that no extra empty line is produced.
 *    * Writing the header line from the first file to the output stream, prefixed
 *      with the random value header when cmdopt.printRandom is set. Header lines
 *      from subsequent files are ignored.
 *    * Assigning a random value to each line, if random values are being generated.
 *      Unweighted values are uniform random values. Weighted values are a uniform
 *      random value raised to the power (1 / weight), or zero when the weight is
 *      not positive.
 * )
 *
 * The key input is a FileData array, one element for each file. The FileData reads
 * the file when instantiated.
 *
 * The return value is an array of InputLine structs. The struct has a 'randomValue'
 * member if random values are being assigned.
 */
InputLine!hasRandomValue[] identifyFileLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted, OutputRange)
(ref FileData[] fileData, TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    static assert(hasRandomValue || !isWeighted);
    static if(!hasRandomValue) assert(!cmdopt.printRandom);

    alias Line = InputLine!hasRandomValue;

    Line[] inputLines;
    auto collected = appender(&inputLines);

    static if (hasRandomValue) auto rng = Random(cmdopt.seed);

    bool headerDone = false;

    foreach (fd; fileData)
    {
        /* Drop the last newline to avoid adding an extra empty line. */
        char[] content = fd.data;
        if (content.length > 0 && content[$ - 1] == '\n') content = content[0 .. $ - 1];

        foreach (fileLineNum, ref line; content.splitter('\n').enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, fd.filename, fileLineNum);

            /* Header lines: written once, from the first file only. */
            if (isFirstLine && cmdopt.hasHeader)
            {
                if (!headerDone)
                {
                    if (cmdopt.printRandom)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerDone = true;
                }
                continue;
            }

            static if (hasRandomValue)
            {
                static if (isWeighted)
                {
                    double weight =
                        getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                             fd.filename, fileLineNum);

                    /* A random value is drawn only for positive weights. */
                    double randomValue = 0.0;
                    if (weight > 0.0) randomValue = uniform01(rng) ^^ (1.0 / weight);
                }
                else
                {
                    double randomValue = uniform01(rng);
                }

                collected.put(Line(line, randomValue));
            }
            else
            {
                collected.put(Line(line));
            }
        }
    }

    return inputLines;
}
1413 
/** Write a floating point random value to an output stream.
 *
 * This routine is used for floating point random value printing. It prefers decimal
 * format, without exponents, and keeps about 17 significant digits, the range
 * available in doubles, for values below 1. To do so the number of digits after the
 * decimal point grows as the value shrinks: 17 digits for values >= 1e-01, 18 for
 * values >= 1e-02, and so on up to 28 digits for values >= 1e-12. Any value that is
 * not >= 1e-12 (smaller values, and also negative values and NaN, which fail every
 * comparison) is written with "%.17g", which may produce exponential notation.
 *
 * The primary reason for this approach is to enable faster sorting on random values
 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster
 * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch).
 * The 'general numeric' handles exponential notation. The difference is 5-10x.
 *
 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12.
 * No examples less than 1e-09 were seen in hundreds of millions of trials. Similar
 * results were seen with weighted sampling with integer weights. The same is not true
 * with floating point weights. These produce quite large exponents. However, even
 * for floating point weights this can be useful. For random weights [0,1] less than 5%
 * will be less than 1e-12 and use exponential notation.
 *
 * Params:
 *   outputStream = Character output range that receives the formatted text.
 *   value = The value to write.
 */
void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;

    /* Lower bounds, largest first. Each bound is paired by index with the decimal
     * format used for values at or above it (and below the previous bound).
     */
    static immutable double[12] lowerBounds =
        [1e-01, 1e-02, 1e-03, 1e-04, 1e-05, 1e-06,
         1e-07, 1e-08, 1e-09, 1e-10, 1e-11, 1e-12];

    static immutable string[12] decimalFormats =
        ["%.17f", "%.18f", "%.19f", "%.20f", "%.21f", "%.22f",
         "%.23f", "%.24f", "%.25f", "%.26f", "%.27f", "%.28f"];

    /* Fallback when no bound matches: general format, exponents allowed. */
    string selectedFormat = "%.17g";

    foreach (i, bound; lowerBounds)
    {
        if (value >= bound)
        {
            selectedFormat = decimalFormats[i];
            break;
        }
    }

    /* Only the selected format string is parsed, once per call. The spec is held in
     * a local because formatValue takes it by reference.
     */
    auto formatSpec = singleSpec(selectedFormat);
    outputStream.formatValue(value, formatSpec);
}
1469 
unittest
{
    /* Table-driven checks of formatRandomValue: each case pairs an input value with
     * the exact text expected on the output stream.
     */
    import std.array : appender;
    import std.format : format;

    static struct FormatCase
    {
        double value;
        string expected;
    }

    auto formatCases = [
        /* Powers of ten, walking down through each decimal format and into the
         * exponential fallback.
         */
        FormatCase(1.0,   "1.00000000000000000"),
        FormatCase(0.1,   "0.10000000000000001"),
        FormatCase(0.01,  "0.010000000000000000"),
        FormatCase(1e-03, "0.0010000000000000000"),
        FormatCase(1e-04, "0.00010000000000000000"),
        FormatCase(1e-05, "0.000010000000000000001"),
        FormatCase(1e-06, "0.0000010000000000000000"),
        FormatCase(1e-07, "0.00000010000000000000000"),
        FormatCase(1e-08, "0.000000010000000000000000"),
        FormatCase(1e-09, "0.0000000010000000000000001"),
        FormatCase(1e-10, "0.00000000010000000000000000"),
        FormatCase(1e-11, "0.000000000009999999999999999"),
        FormatCase(1e-12, "0.0000000000010000000000000000"),
        FormatCase(1e-13, "1e-13"),
        FormatCase(1e-14, "1e-14"),

        /* A 17 digit mantissa at the same range of magnitudes. */
        FormatCase(12345678901234567e-15, "12.34567890123456735"),
        FormatCase(12345678901234567e-16, "1.23456789012345669"),
        FormatCase(12345678901234567e-17, "0.12345678901234566"),
        FormatCase(12345678901234567e-18, "0.012345678901234567"),
        FormatCase(12345678901234567e-19, "0.0012345678901234567"),
        FormatCase(12345678901234567e-20, "0.00012345678901234567"),
        FormatCase(12345678901234567e-21, "0.000012345678901234568"),
        FormatCase(12345678901234567e-22, "0.0000012345678901234567"),
        FormatCase(12345678901234567e-23, "0.00000012345678901234566"),
        FormatCase(12345678901234567e-24, "0.000000012345678901234567"),
        FormatCase(12345678901234567e-25, "0.0000000012345678901234566"),
        FormatCase(12345678901234567e-26, "0.00000000012345678901234568"),
        FormatCase(12345678901234567e-27, "0.000000000012345678901234567"),
        FormatCase(12345678901234567e-28, "0.0000000000012345678901234567"),
        FormatCase(12345678901234567e-29, "1.2345678901234566e-13"),
    ];

    foreach (testCase; formatCases)
    {
        auto buffer = appender!string();
        buffer.formatRandomValue(testCase.value);
        assert(buffer.data == testCase.expected,
               format("[testFormatValue] value: %g; expected: %s; actual: %s",
                      testCase.value, testCase.expected, buffer.data));
    }
}
1514 
1515 
import std.traits : isSomeChar;

/** Convenience function for extracting a single field from a line. See
 * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error
 * text tailored for this program.
 *
 * Params:
 *   line = The line to extract the field from.
 *   fieldIndex = Index of the field to extract, passed through to getTsvFieldValue.
 *                Index 0 is the first field.
 *   delim = Field delimiter, passed through to getTsvFieldValue.
 *   filename = File name used in error text. "-" is reported as "Standard Input".
 *   lineNum = Line number used in error text. When it is 1, conversion failures
 *             also include a hint that the line may be a header.
 *
 * Returns: The field value converted to type T.
 *
 * Throws: Exception, with the file name and line number added to the message, when
 *         getTsvFieldValue throws a ConvException or any other Exception.
 */
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException, to;
    import std.format : format;
    import tsv_utils.common.utils : getTsvFieldValue;

    try
    {
        return getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* The field text could not be converted to T. On the first line of a file
         * the likely cause is an unskipped header, so a hint is appended.
         */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s%s",
                   exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum,
                   (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : ""));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s",
                   exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum));
    }
}
1550 
unittest
{
    /* Basic sanity checks on the getFieldValue wrapper. getTsvFieldValue has its own
     * tests, so only a successful extraction and the two failure paths are covered.
     */
    import std.exception : assertThrown;

    static immutable size_t[] lineNumbers = [1, 2];

    /* Successful conversions of the first field. */
    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);

    /* Field text that is not a number. Line 1 and a later line are both tried
     * because the error text is built differently for line 1.
     */
    foreach (lineNum; lineNumbers)
    {
        assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", lineNum));
    }

    /* Field index past the last field on the line. */
    foreach (lineNum; lineNumbers)
    {
        assertThrown(getFieldValue!double("123", 1, '\t', "unittest", lineNum));
    }
}
1565 
1566 /* Unit tests for the main program start here.
1567  *
1568  * Portability note: Many of the tests here rely on generating consistent random numbers
1569  * across different platforms when using the same random seed. So far this has succeeded
 * on several different platform, compiler, and library versions. However, it is certainly
1571  * possible this condition will not hold on other platforms.
1572  *
1573  * For tsv-sample, this portability implies generating the same results on different
1574  * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
1575  * but it is convenient for testing. If platforms are identified that do not generate
1576  * the same results these tests will need to be adjusted.
1577  */
version(unittest)
{
    /* Helpers shared by the unit tests of the main program. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /** Runs tsvSample for one command line and asserts on the text it produces.
     *
     * Params:
     *   cmdArgs = Command line arguments. Must not be empty; cmdArgs[0] is used as
     *             a label in assertion messages.
     *   expected = Expected output rows. Converted to text with tsvDataToString and
     *              compared against the generated output.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Builds an assertion message prefixed with this helper's name and cmdArgs[0]. */
        auto failureMessage(T...)(string detail, T detailArgs)
        {
            return format("[testTsvSample] %s: " ~ detail, cmdArgs[0], detailArgs);
        }

        /* Capture the arguments as text before option processing so a failure message
         * shows them as passed in. NOTE(review): presumably processArgs alters
         * cmdArgs - confirm.
         */
        auto argsAsText = cmdArgs.to!string;

        TsvSampleOptions options;
        auto parseResult = options.processArgs(cmdArgs);
        assert(parseResult[0], failureMessage("Invalid command lines arg: '%s'.", argsAsText));

        auto actual = appender!(char[])();
        tsvSample(options, actual);    // This invokes the main code line.

        auto expectedText = expected.tsvDataToString;

        assert(actual.data == expectedText,
               failureMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedText.to!string, actual.data.to!string));
    }
}
1614 
1615 unittest
1616 {
1617     import std.path : buildPath;
1618     import std.file : rmdirRecurse;
1619     import std.format : format;
1620 
1621     auto testDir = makeUnittestTempDir("tsv_sample");
1622     scope(exit) testDir.rmdirRecurse;
1623 
1624     /* Tabular data sets and expected results use the built-in static seed.
1625      * Tests are run by writing the data set to a file, then calling the main
1626      * routine to process. The function testTsvSample plays the role of the
     * main program. Rather than writing to standard output, the results are
1628      * matched against expected. The expected results were verified by hand
1629      * prior to inclusion in the test.
1630      *
1631      * The initial part of this section is simply setting up data files and
1632      * expected results.
1633      *
1634      * Expected results naming conventions:
1635      *  - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected
1636      *  - Sampling Type (required): Permute, Replace, Bernoulli, Distinct
1637      *  - Compatibility: Compat, AlgoR, Skip, Swap
1638      *  - Weight Field: Wt<num>, e.g. Wt3
1639      *  - Sample Size: Num<num>, eg. Num3
1640      *  - Seed Value: V<num>, eg. V77
1641      *  - Key Field: K<num>, e.g. K2
1642      *  - Probability: P<num>, e.g P05 (5%)
     *  - Printing Probabilities: Probs
1644      *  - Printing Probs in order: ProbsInorder
1645      *  - Printing Probs with custom header: RVCustom
1646      */
1647 
1648     /* Empty file. */
1649     string[][] dataEmpty = [];
1650     string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
1651     writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
1652 
    /* 3x0, header only. */
1654     string[][] data3x0 = [["field_a", "field_b", "field_c"]];
1655     string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
1656     writeUnittestTsvFile(fpath_data3x0, data3x0);
1657 
1658     /* 3x1 */
1659     string[][] data3x1 =
1660         [["field_a", "field_b", "field_c"],
1661          ["tan", "タン", "8.5"]];
1662 
1663     string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
1664     string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
1665     writeUnittestTsvFile(fpath_data3x1, data3x1);
1666     writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]);
1667 
1668     string[][] data3x1ExpectedReplaceNum3 =
1669         [["field_a", "field_b", "field_c"],
1670          ["tan", "タン", "8.5"],
1671          ["tan", "タン", "8.5"],
1672          ["tan", "タン", "8.5"]];
1673 
1674     /* 3x2 */
1675     string[][] data3x2 =
1676         [["field_a", "field_b", "field_c"],
1677          ["brown", "褐色", "29.2"],
1678          ["gray", "グレー", "6.2"]];
1679 
1680     string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
1681     string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
1682     writeUnittestTsvFile(fpath_data3x2, data3x2);
1683     writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]);
1684 
1685     string[][] data3x2PermuteCompat =
1686         [["field_a", "field_b", "field_c"],
1687          ["gray", "グレー", "6.2"],
1688          ["brown", "褐色", "29.2"]];
1689 
1690     string[][] data3x2PermuteShuffle =
1691         [["field_a", "field_b", "field_c"],
1692          ["gray", "グレー", "6.2"],
1693          ["brown", "褐色", "29.2"]];
1694 
1695     /* 3x3 */
1696     string[][] data3x3 =
1697         [["field_a", "field_b", "field_c"],
1698          ["orange", "オレンジ", "2.5"],
1699          ["pink", "ピンク", "1.1"],
1700          ["purple", "紫の", "42"]];
1701 
1702     string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
1703     string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
1704     writeUnittestTsvFile(fpath_data3x3, data3x3);
1705     writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]);
1706 
1707     string[][] data3x3ExpectedPermuteCompat =
1708         [["field_a", "field_b", "field_c"],
1709          ["purple", "紫の", "42"],
1710          ["pink", "ピンク", "1.1"],
1711          ["orange", "オレンジ", "2.5"]];
1712 
1713     string[][] data3x3ExpectedPermuteSwap =
1714         [["field_a", "field_b", "field_c"],
1715          ["purple", "紫の", "42"],
1716          ["orange", "オレンジ", "2.5"],
1717          ["pink", "ピンク", "1.1"]];
1718 
1719     /* 3x6 */
1720     string[][] data3x6 =
1721         [["field_a", "field_b", "field_c"],
1722          ["red", "赤", "23.8"],
1723          ["green", "緑", "0.0072"],
1724          ["white", "白", "1.65"],
1725          ["yellow", "黄", "12"],
1726          ["blue", "青", "12"],
1727          ["black", "黒", "0.983"]];
1728     string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
1729     string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
1730     writeUnittestTsvFile(fpath_data3x6, data3x6);
1731     writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]);
1732 
1733     // Randomization, all lines
1734     string[][] data3x6ExpectedPermuteCompat =
1735         [["field_a", "field_b", "field_c"],
1736          ["yellow", "黄", "12"],
1737          ["black", "黒", "0.983"],
1738          ["blue", "青", "12"],
1739          ["white", "白", "1.65"],
1740          ["green", "緑", "0.0072"],
1741          ["red", "赤", "23.8"]];
1742 
1743     string[][] data3x6ExpectedPermuteSwap =
1744         [["field_a", "field_b", "field_c"],
1745          ["black", "黒", "0.983"],
1746          ["green", "緑", "0.0072"],
1747          ["red", "赤", "23.8"],
1748          ["yellow", "黄", "12"],
1749          ["white", "白", "1.65"],
1750          ["blue", "青", "12"]];
1751 
1752     string[][] data3x6ExpectedPermuteCompatProbs =
1753         [["random_value", "field_a", "field_b", "field_c"],
1754          ["0.96055546286515892", "yellow", "黄", "12"],
1755          ["0.75710153928957880", "black", "黒", "0.983"],
1756          ["0.52525980887003243", "blue", "青", "12"],
1757          ["0.49287854949943721", "white", "白", "1.65"],
1758          ["0.15929344086907804", "green", "緑", "0.0072"],
1759          ["0.010968807619065046", "red", "赤", "23.8"]];
1760 
    /* Note: data3x6ExpectedPermuteAlgoRNum6 is identical to data3x6ExpectedPermuteSwap
     * because both are effectively the same algorithm given that --num is data length.
     * Both read in the full data in order then call randomShuffle.
     */
1765     string[][] data3x6ExpectedPermuteAlgoRNum6 =
1766         [["field_a", "field_b", "field_c"],
1767          ["black", "黒", "0.983"],
1768          ["green", "緑", "0.0072"],
1769          ["red", "赤", "23.8"],
1770          ["yellow", "黄", "12"],
1771          ["white", "白", "1.65"],
1772          ["blue", "青", "12"]];
1773 
1774     string[][] data3x6ExpectedPermuteAlgoRNum5 =
1775         [["field_a", "field_b", "field_c"],
1776          ["red", "赤", "23.8"],
1777          ["black", "黒", "0.983"],
1778          ["white", "白", "1.65"],
1779          ["green", "緑", "0.0072"],
1780          ["yellow", "黄", "12"]];
1781 
1782     string[][] data3x6ExpectedPermuteAlgoRNum4 =
1783         [["field_a", "field_b", "field_c"],
1784          ["blue", "青", "12"],
1785          ["green", "緑", "0.0072"],
1786          ["black", "黒", "0.983"],
1787          ["white", "白", "1.65"]];
1788 
1789     string[][] data3x6ExpectedPermuteAlgoRNum3 =
1790         [["field_a", "field_b", "field_c"],
1791          ["red", "赤", "23.8"],
1792          ["black", "黒", "0.983"],
1793          ["green", "緑", "0.0072"]];
1794 
1795     string[][] data3x6ExpectedPermuteAlgoRNum2 =
1796         [["field_a", "field_b", "field_c"],
1797          ["black", "黒", "0.983"],
1798          ["red", "赤", "23.8"]];
1799 
1800     string[][] data3x6ExpectedPermuteAlgoRNum1 =
1801         [["field_a", "field_b", "field_c"],
1802          ["green", "緑", "0.0072"]];
1803 
1804     string[][] data3x6ExpectedBernoulliProbsP100 =
1805         [["random_value", "field_a", "field_b", "field_c"],
1806          ["0.010968807619065046", "red", "赤", "23.8"],
1807          ["0.15929344086907804", "green", "緑", "0.0072"],
1808          ["0.49287854949943721", "white", "白", "1.65"],
1809          ["0.96055546286515892", "yellow", "黄", "12"],
1810          ["0.52525980887003243", "blue", "青", "12"],
1811          ["0.75710153928957880", "black", "黒", "0.983"]];
1812 
1813     string[][] data3x6ExpectedBernoulliCompatProbsP60 =
1814         [["random_value", "field_a", "field_b", "field_c"],
1815          ["0.010968807619065046", "red", "赤", "23.8"],
1816          ["0.15929344086907804", "green", "緑", "0.0072"],
1817          ["0.49287854949943721", "white", "白", "1.65"],
1818          ["0.52525980887003243", "blue", "青", "12"]];
1819 
1820     string[][] data3x6ExpectedBernoulliSkipP40 =
1821         [["field_a", "field_b", "field_c"],
1822          ["red", "赤", "23.8"],
1823          ["green", "緑", "0.0072"],
1824          ["yellow", "黄", "12"]];
1825 
1826     string[][] data3x6ExpectedBernoulliCompatP60 =
1827         [["field_a", "field_b", "field_c"],
1828          ["red", "赤", "23.8"],
1829          ["green", "緑", "0.0072"],
1830          ["white", "白", "1.65"],
1831          ["blue", "青", "12"]];
1832 
1833     string[][] data3x6ExpectedDistinctK1K3P60 =
1834         [["field_a", "field_b", "field_c"],
1835          ["green", "緑", "0.0072"],
1836          ["white", "白", "1.65"],
1837          ["blue", "青", "12"]];
1838 
1839     string[][] data3x6ExpectedDistinctK1K3P60Probs =
1840         [["random_value", "field_a", "field_b", "field_c"],
1841          ["0", "green", "緑", "0.0072"],
1842          ["0", "white", "白", "1.65"],
1843          ["0", "blue", "青", "12"]];
1844 
1845     string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom =
1846         [["custom_random_value_header", "field_a", "field_b", "field_c"],
1847          ["0", "green", "緑", "0.0072"],
1848          ["0", "white", "白", "1.65"],
1849          ["0", "blue", "青", "12"]];
1850 
1851     string[][] data3x6ExpectedDistinctK2P2ProbsInorder =
1852         [["random_value", "field_a", "field_b", "field_c"],
1853          ["1", "red", "赤", "23.8"],
1854          ["0", "green", "緑", "0.0072"],
1855          ["0", "white", "白", "1.65"],
1856          ["1", "yellow", "黄", "12"],
1857          ["3", "blue", "青", "12"],
1858          ["2", "black", "黒", "0.983"]];
1859 
1860     string[][] data3x6ExpectedPermuteWt3Probs =
1861         [["random_value", "field_a", "field_b", "field_c"],
1862          ["0.99665198757645390", "yellow", "黄", "12"],
1863          ["0.94775884809836686", "blue", "青", "12"],
1864          ["0.82728234682286661", "red", "赤", "23.8"],
1865          ["0.75346697377181959", "black", "黒", "0.983"],
1866          ["0.65130103496422487", "white", "白", "1.65"],
1867          ["1.5636943712879866e-111", "green", "緑", "0.0072"]];
1868 
1869     string[][] data3x6ExpectedWt3ProbsInorder =
1870         [["random_value", "field_a", "field_b", "field_c"],
1871          ["0.82728234682286661", "red", "赤", "23.8"],
1872          ["1.5636943712879866e-111", "green", "緑", "0.0072"],
1873          ["0.65130103496422487", "white", "白", "1.65"],
1874          ["0.99665198757645390", "yellow", "黄", "12"],
1875          ["0.94775884809836686", "blue", "青", "12"],
1876          ["0.75346697377181959", "black", "黒", "0.983"]];
1877 
1878     string[][] data3x6ExpectedPermuteWt3 =
1879         [["field_a", "field_b", "field_c"],
1880          ["yellow", "黄", "12"],
1881          ["blue", "青", "12"],
1882          ["red", "赤", "23.8"],
1883          ["black", "黒", "0.983"],
1884          ["white", "白", "1.65"],
1885          ["green", "緑", "0.0072"]];
1886 
1887     string[][] data3x6ExpectedReplaceNum10 =
1888         [["field_a", "field_b", "field_c"],
1889          ["black", "黒", "0.983"],
1890          ["green", "緑", "0.0072"],
1891          ["green", "緑", "0.0072"],
1892          ["red", "赤", "23.8"],
1893          ["yellow", "黄", "12"],
1894          ["red", "赤", "23.8"],
1895          ["white", "白", "1.65"],
1896          ["yellow", "黄", "12"],
1897          ["yellow", "黄", "12"],
1898          ["white", "白", "1.65"],
1899         ];
1900 
1901     string[][] data3x6ExpectedReplaceNum10V77 =
1902         [["field_a", "field_b", "field_c"],
1903          ["black", "黒", "0.983"],
1904          ["red", "赤", "23.8"],
1905          ["black", "黒", "0.983"],
1906          ["yellow", "黄", "12"],
1907          ["green", "緑", "0.0072"],
1908          ["green", "緑", "0.0072"],
1909          ["green", "緑", "0.0072"],
1910          ["yellow", "黄", "12"],
1911          ["blue", "青", "12"],
1912          ["white", "白", "1.65"],
1913         ];
1914 
1915     /* Using a different static seed. */
1916     string[][] data3x6ExpectedPermuteCompatV41Probs =
1917         [["random_value", "field_a", "field_b", "field_c"],
1918          ["0.68057272653095424", "green", "緑", "0.0072"],
1919          ["0.67681624367833138", "blue", "青", "12"],
1920          ["0.32097338931635022", "yellow", "黄", "12"],
1921          ["0.25092361867427826", "red", "赤", "23.8"],
1922          ["0.15535934292711318", "black", "黒", "0.983"],
1923          ["0.046095821075141430", "white", "白", "1.65"]];
1924 
1925     string[][] data3x6ExpectedBernoulliCompatP60V41Probs =
1926         [["random_value", "field_a", "field_b", "field_c"],
1927          ["0.25092361867427826", "red", "赤", "23.8"],
1928          ["0.046095821075141430", "white", "白", "1.65"],
1929          ["0.32097338931635022", "yellow", "黄", "12"],
1930          ["0.15535934292711318", "black", "黒", "0.983"]];
1931 
1932     string[][] data3x6ExpectedPermuteWt3V41Probs =
1933         [["random_value", "field_a", "field_b", "field_c"],
1934          ["0.96799377498910666", "blue", "青", "12"],
1935          ["0.94356245792573568", "red", "赤", "23.8"],
1936          ["0.90964601024271996", "yellow", "黄", "12"],
1937          ["0.15491658409260103", "white", "白", "1.65"],
1938          ["0.15043620392537033", "black", "黒", "0.983"],
1939          ["6.1394674830701461e-24", "green", "緑", "0.0072"]];
1940 
1941     string[][] data3x6ExpectedWt3V41ProbsInorder =
1942         [["random_value", "field_a", "field_b", "field_c"],
1943          ["0.94356245792573568", "red", "赤", "23.8"],
1944          ["6.1394674830701461e-24", "green", "緑", "0.0072"],
1945          ["0.15491658409260103", "white", "白", "1.65"],
1946          ["0.90964601024271996", "yellow", "黄", "12"],
1947          ["0.96799377498910666", "blue", "青", "12"],
1948          ["0.15043620392537033", "black", "黒", "0.983"]];
1949 
1950 
1951     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
1952     string[][] combo1ExpectedPermuteCompat =
1953         [["field_a", "field_b", "field_c"],
1954          ["yellow", "黄", "12"],
1955          ["tan", "タン", "8.5"],
1956          ["brown", "褐色", "29.2"],
1957          ["green", "緑", "0.0072"],
1958          ["red", "赤", "23.8"],
1959          ["purple", "紫の", "42"],
1960          ["black", "黒", "0.983"],
1961          ["white", "白", "1.65"],
1962          ["gray", "グレー", "6.2"],
1963          ["blue", "青", "12"],
1964          ["pink", "ピンク", "1.1"],
1965          ["orange", "オレンジ", "2.5"]];
1966 
1967     string[][] combo1ExpectedPermuteCompatProbs =
1968         [["random_value", "field_a", "field_b", "field_c"],
1969          ["0.97088520275428891", "yellow", "黄", "12"],
1970          ["0.96055546286515892", "tan", "タン", "8.5"],
1971          ["0.81756894313730299", "brown", "褐色", "29.2"],
1972          ["0.75710153928957880", "green", "緑", "0.0072"],
1973          ["0.52525980887003243", "red", "赤", "23.8"],
1974          ["0.49287854949943721", "purple", "紫の", "42"],
1975          ["0.47081507067196071", "black", "黒", "0.983"],
1976          ["0.38388182921335101", "white", "白", "1.65"],
1977          ["0.29215990612283349", "gray", "グレー", "6.2"],
1978          ["0.24033216014504433", "blue", "青", "12"],
1979          ["0.15929344086907804", "pink", "ピンク", "1.1"],
1980          ["0.010968807619065046", "orange", "オレンジ", "2.5"]];
1981 
1982     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
1983     string[][] combo1ExpectedProbsInorder =
1984         [["random_value", "field_a", "field_b", "field_c"],
1985          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
1986          ["0.15929344086907804", "pink", "ピンク", "1.1"],
1987          ["0.49287854949943721", "purple", "紫の", "42"],
1988          ["0.96055546286515892", "tan", "タン", "8.5"],
1989          ["0.52525980887003243", "red", "赤", "23.8"],
1990          ["0.75710153928957880", "green", "緑", "0.0072"],
1991          ["0.38388182921335101", "white", "白", "1.65"],
1992          ["0.97088520275428891", "yellow", "黄", "12"],
1993          ["0.24033216014504433", "blue", "青", "12"],
1994          ["0.47081507067196071", "black", "黒", "0.983"],
1995          ["0.81756894313730299", "brown", "褐色", "29.2"],
1996          ["0.29215990612283349", "gray", "グレー", "6.2"]];
1997 
1998     string[][] combo1ExpectedBernoulliCompatP50Probs =
1999         [["random_value", "field_a", "field_b", "field_c"],
2000          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2001          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2002          ["0.49287854949943721", "purple", "紫の", "42"],
2003          ["0.38388182921335101", "white", "白", "1.65"],
2004          ["0.24033216014504433", "blue", "青", "12"],
2005          ["0.47081507067196071", "black", "黒", "0.983"],
2006          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2007 
2008     string[][] combo1ExpectedBernoulliCompatP40 =
2009         [["field_a", "field_b", "field_c"],
2010          ["orange", "オレンジ", "2.5"],
2011          ["pink", "ピンク", "1.1"],
2012          ["white", "白", "1.65"],
2013          ["blue", "青", "12"],
2014          ["gray", "グレー", "6.2"]];
2015 
2016     string[][] combo1ExpectedDistinctK1P40 =
2017         [["field_a", "field_b", "field_c"],
2018          ["orange", "オレンジ", "2.5"],
2019          ["red", "赤", "23.8"],
2020          ["green", "緑", "0.0072"],
2021          ["blue", "青", "12"],
2022          ["black", "黒", "0.983"]];
2023 
2024     string[][] combo1ExpectedPermuteWt3Probs =
2025         [["random_value", "field_a", "field_b", "field_c"],
2026          ["0.99754077523718754", "yellow", "黄", "12"],
2027          ["0.99527665440088786", "tan", "タン", "8.5"],
2028          ["0.99312578945741659", "brown", "褐色", "29.2"],
2029          ["0.98329602553389361", "purple", "紫の", "42"],
2030          ["0.97330961938083660", "red", "赤", "23.8"],
2031          ["0.88797551521739648", "blue", "青", "12"],
2032          ["0.81999230489041786", "gray", "グレー", "6.2"],
2033          ["0.55975569204250941", "white", "白", "1.65"],
2034          ["0.46472135609205739", "black", "黒", "0.983"],
2035          ["0.18824582704191337", "pink", "ピンク", "1.1"],
2036          ["0.16446131853299920", "orange", "オレンジ", "2.5"],
2037          ["1.6438086931020549e-17", "green", "緑", "0.0072"]];
2038 
2039     string[][] combo1ExpectedPermuteWt3 =
2040         [["field_a", "field_b", "field_c"],
2041          ["yellow", "黄", "12"],
2042          ["tan", "タン", "8.5"],
2043          ["brown", "褐色", "29.2"],
2044          ["purple", "紫の", "42"],
2045          ["red", "赤", "23.8"],
2046          ["blue", "青", "12"],
2047          ["gray", "グレー", "6.2"],
2048          ["white", "白", "1.65"],
2049          ["black", "黒", "0.983"],
2050          ["pink", "ピンク", "1.1"],
2051          ["orange", "オレンジ", "2.5"],
2052          ["green", "緑", "0.0072"]];
2053 
2054         string[][] combo1ExpectedPermuteAlgoRNum4 =
2055         [["field_a", "field_b", "field_c"],
2056          ["blue", "青", "12"],
2057          ["gray", "グレー", "6.2"],
2058          ["brown", "褐色", "29.2"],
2059          ["white", "白", "1.65"]];
2060 
2061     string[][] combo1ExpectedReplaceNum10 =
2062         [["field_a", "field_b", "field_c"],
2063          ["gray", "グレー", "6.2"],
2064          ["yellow", "黄", "12"],
2065          ["yellow", "黄", "12"],
2066          ["white", "白", "1.65"],
2067          ["tan", "タン", "8.5"],
2068          ["white", "白", "1.65"],
2069          ["blue", "青", "12"],
2070          ["black", "黒", "0.983"],
2071          ["tan", "タン", "8.5"],
2072          ["purple", "紫の", "42"]];
2073 
2074     /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. */
2075     string[][] data1x200 =
2076         [["field_a"],
2077          ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"],
2078          ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"],
2079          ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"],
2080          ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"],
2081          ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"],
2082          ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"],
2083          ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"],
2084          ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"],
2085          ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"],
2086          ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"],
2087          ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"],
2088          ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"],
2089          ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"],
2090          ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"],
2091          ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"],
2092          ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"],
2093          ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"],
2094          ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"],
2095          ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"],
2096          ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"],
2097         ];
2098 
2099     string fpath_data1x200 = buildPath(testDir, "data1x200.tsv");
2100     string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv");
2101     writeUnittestTsvFile(fpath_data1x200, data1x200);
2102     writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1..$]);
2103 
2104     string[][] data1x200ExpectedBernoulliSkipV333P01 =
2105         [["field_a"],
2106          ["077"],
2107          ["119"]];
2108 
2109     string[][] data1x200ExpectedBernoulliSkipV333P02 =
2110         [["field_a"],
2111          ["038"],
2112          ["059"],
2113          ["124"],
2114          ["161"],
2115          ["162"],
2116          ["183"]];
2117 
2118     string[][] data1x200ExpectedBernoulliSkipV333P03 =
2119         [["field_a"],
2120          ["025"],
2121          ["039"],
2122          ["082"],
2123          ["107"],
2124          ["108"],
2125          ["122"],
2126          ["136"],
2127          ["166"],
2128          ["182"]];
2129 
2130     string[][] data1x200ExpectedBernoulliCompatV333P01 =
2131         [["field_a"],
2132          ["072"]];
2133 
2134     string[][] data1x200ExpectedBernoulliCompatV333P02 =
2135         [["field_a"],
2136          ["004"],
2137          ["072"]];
2138 
2139     string[][] data1x200ExpectedBernoulliCompatV333P03 =
2140         [["field_a"],
2141          ["004"],
2142          ["072"],
2143          ["181"]];
2144 
2145     /* Combo 2, for Bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files,
2146      * only expected results. The header is from 3x0; the results are offset by one position
2147      * from data1x200ExpectedBernoulliSkipV333P03 due to the insertion of a single preceding line.
2148      */
2149     string[][] combo2ExpectedBernoulliSkipV333P03 =
2150         [["field_a", "field_b", "field_c"],
2151          ["024"],
2152          ["038"],
2153          ["081"],
2154          ["106"],
2155          ["107"],
2156          ["121"],
2157          ["135"],
2158          ["165"],
2159          ["181"]];
2160 
2161 
2162     /* 1x10 - Simple 1-column file. */
2163     string[][] data1x10 =
2164         [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
2165     string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
2166     string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
2167     writeUnittestTsvFile(fpath_data1x10, data1x10);
2168     writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]);
2169 
2170     string[][] data1x10ExpectedPermuteCompat =
2171         [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
2172 
2173     string[][] data1x10ExpectedPermuteWt1 =
2174         [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]];
2175 
2176     /* 2x10a - Uniform distribution [0,1]. */
2177     string[][] data2x10a =
2178         [["line", "weight"],
2179          ["1", "0.26788837"],
2180          ["2", "0.06601298"],
2181          ["3", "0.38627527"],
2182          ["4", "0.47379424"],
2183          ["5", "0.02966641"],
2184          ["6", "0.05636231"],
2185          ["7", "0.70529242"],
2186          ["8", "0.91836862"],
2187          ["9", "0.99103720"],
2188          ["10", "0.31401740"]];
2189 
2190     string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv");
2191     writeUnittestTsvFile(fpath_data2x10a, data2x10a);
2192 
2193     string[][] data2x10aExpectedPermuteWt2Probs =
2194         [["random_value", "line", "weight"],
2195          ["0.96833865494543658", "8", "0.91836862"],
2196          ["0.91856842054413923", "4", "0.47379424"],
2197          ["0.25730832087795091", "7", "0.70529242"],
2198          ["0.23725317907018120", "9", "0.99103720"],
2199          ["0.16016096701872204", "3", "0.38627527"],
2200          ["0.090819662667243381", "10", "0.31401740"],
2201          ["0.0071764539244361172", "6", "0.05636231"],
2202          ["0.000000048318642951630057", "1", "0.26788837"],
2203          ["0.00000000037525692966535517", "5", "0.02966641"],
2204          ["8.2123247880095796e-13", "2", "0.06601298"]];
2205 
2206     /* 2x10b - Uniform distribution [0,1000]. */
2207     string[][] data2x10b =
2208         [["line", "weight"],
2209          ["1", "761"],
2210          ["2", "432"],
2211          ["3", "103"],
2212          ["4", "448"],
2213          ["5", "750"],
2214          ["6", "711"],
2215          ["7", "867"],
2216          ["8", "841"],
2217          ["9", "963"],
2218          ["10", "784"]];
2219 
2220     string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv");
2221     writeUnittestTsvFile(fpath_data2x10b, data2x10b);
2222 
2223     string[][] data2x10bExpectedPermuteWt2Probs =
2224         [["random_value", "line", "weight"],
2225          ["0.99996486739067969", "8", "841"],
2226          ["0.99991017467137211", "4", "448"],
2227          ["0.99960871524873662", "6", "711"],
2228          ["0.99914188537143800", "5", "750"],
2229          ["0.99903963250274785", "10", "784"],
2230          ["0.99889631825931946", "7", "867"],
2231          ["0.99852058315191139", "9", "963"],
2232          ["0.99575669679158918", "2", "432"],
2233          ["0.99408758732050595", "1", "761"],
2234          ["0.99315467761212362", "3", "103"]];
2235 
2236     /* 2x10c - Logarithmic distribution in random order. */
2237     string[][] data2x10c =
2238         [["line", "weight"],
2239          ["1", "31.85"],
2240          ["2", "17403.31"],
2241          ["3", "653.84"],
2242          ["4", "8.23"],
2243          ["5", "2671.04"],
2244          ["6", "26226.08"],
2245          ["7", "1.79"],
2246          ["8", "354.56"],
2247          ["9", "35213.81"],
2248          ["10", "679.29"]];
2249 
2250     string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv");
2251     writeUnittestTsvFile(fpath_data2x10c, data2x10c);
2252 
2253     string[][] data2x10cExpectedPermuteWt2Probs =
2254         [["random_value", "line", "weight"],
2255          ["0.99998939008709697", "6", "26226.08"],
2256          ["0.99995951291695517", "9", "35213.81"],
2257          ["0.99991666907613541", "8", "354.56"],
2258          ["0.99989445052186410", "2", "17403.31"],
2259          ["0.99975897602861630", "5", "2671.04"],
2260          ["0.99891852769877643", "3", "653.84"],
2261          ["0.99889167752782515", "10", "679.29"],
2262          ["0.99512207506850148", "4", "8.23"],
2263          ["0.86789371584259023", "1", "31.85"],
2264          ["0.58574438162915610", "7", "1.79"]];
2265 
2266     /* 2x10d - Logarithmic distribution in ascending order. */
2267     string[][] data2x10d =
2268         [["line", "weight"],
2269          ["1", "1.79"],
2270          ["2", "8.23"],
2271          ["3", "31.85"],
2272          ["4", "354.56"],
2273          ["5", "653.84"],
2274          ["6", "679.29"],
2275          ["7", "2671.04"],
2276          ["8", "17403.31"],
2277          ["9", "26226.08"],
2278          ["10", "35213.81"]];
2279 
2280     string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv");
2281     writeUnittestTsvFile(fpath_data2x10d, data2x10d);
2282 
2283     string[][] data2x10dExpectedPermuteWt2Probs =
2284         [["random_value", "line", "weight"],
2285          ["0.99999830221846353", "8", "17403.31"],
2286          ["0.99997860834041397", "10", "35213.81"],
2287          ["0.99994563828986716", "9", "26226.08"],
2288          ["0.99988650363575737", "4", "354.56"],
2289          ["0.99964161939190088", "7", "2671.04"],
2290          ["0.99959045338948649", "6", "679.29"],
2291          ["0.99901574490639788", "5", "653.84"],
2292          ["0.97803163304747431", "3", "31.85"],
2293          ["0.79994791806910948", "2", "8.23"],
2294          ["0.080374261239949119", "1", "1.79"]];
2295 
2296     /* 2x10e - Logarithmic distribution in descending order. */
2297     string[][] data2x10e =
2298         [["line", "weight"],
2299          ["1", "35213.81"],
2300          ["2", "26226.08"],
2301          ["3", "17403.31"],
2302          ["4", "2671.04"],
2303          ["5", "679.29"],
2304          ["6", "653.84"],
2305          ["7", "354.56"],
2306          ["8", "31.85"],
2307          ["9", "8.23"],
2308          ["10", "1.79"]];
2309     string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
2310     writeUnittestTsvFile(fpath_data2x10e, data2x10e);
2311 
2312     string[][] data2x10eExpectedPermuteWt2Probs =
2313         [["random_value", "line", "weight"],
2314          ["0.99998493348975237", "4", "2671.04"],
2315          ["0.99995934807202624", "3", "17403.31"],
2316          ["0.99992995739727453", "2", "26226.08"],
2317          ["0.99987185679245649", "1", "35213.81"],
2318          ["0.99957451563173938", "6", "653.84"],
2319          ["0.99907273650209583", "8", "31.85"],
2320          ["0.99905260312968946", "5", "679.29"],
2321          ["0.99730333650516401", "7", "354.56"],
2322          ["0.84093902435227808", "9", "8.23"],
2323          ["0.65650015926290028", "10", "1.79"]];
2324 
2325     /* Data sets for distinct sampling. */
2326     string[][] data5x25 =
2327         [["ID", "Shape", "Color", "Size", "Weight"],
2328          ["01", "circle", "red", "S", "10"],
2329          ["02", "circle", "black", "L", "20"],
2330          ["03", "square", "black", "L", "20"],
2331          ["04", "circle", "green", "L", "30"],
2332          ["05", "ellipse", "red", "S", "20"],
2333          ["06", "triangle", "red", "S", "10"],
2334          ["07", "triangle", "red", "L", "20"],
2335          ["08", "square", "black", "S", "10"],
2336          ["09", "circle", "black", "S", "20"],
2337          ["10", "square", "green", "L", "20"],
2338          ["11", "triangle", "red", "L", "20"],
2339          ["12", "circle", "green", "L", "30"],
2340          ["13", "ellipse", "red", "S", "20"],
2341          ["14", "circle", "green", "L", "30"],
2342          ["15", "ellipse", "red", "L", "30"],
2343          ["16", "square", "red", "S", "10"],
2344          ["17", "circle", "black", "L", "20"],
2345          ["18", "square", "red", "S", "20"],
2346          ["19", "square", "black", "L", "20"],
2347          ["20", "circle", "red", "S", "10"],
2348          ["21", "ellipse", "black", "L", "30"],
2349          ["22", "triangle", "red", "L", "30"],
2350          ["23", "circle", "green", "S", "20"],
2351          ["24", "square", "green", "L", "20"],
2352          ["25", "circle", "red", "S", "10"],
2353         ];
2354 
2355     string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
2356     string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
2357     writeUnittestTsvFile(fpath_data5x25, data5x25);
2358     writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]);
2359 
2360     string[][] data5x25ExpectedDistinctK2P40 =
2361         [["ID", "Shape", "Color", "Size", "Weight"],
2362          ["03", "square", "black", "L", "20"],
2363          ["05", "ellipse", "red", "S", "20"],
2364          ["08", "square", "black", "S", "10"],
2365          ["10", "square", "green", "L", "20"],
2366          ["13", "ellipse", "red", "S", "20"],
2367          ["15", "ellipse", "red", "L", "30"],
2368          ["16", "square", "red", "S", "10"],
2369          ["18", "square", "red", "S", "20"],
2370          ["19", "square", "black", "L", "20"],
2371          ["21", "ellipse", "black", "L", "30"],
2372          ["24", "square", "green", "L", "20"],
2373         ];
2374 
2375     string[][] data5x25ExpectedDistinctK2K4P20 =
2376         [["ID", "Shape", "Color", "Size", "Weight"],
2377          ["03", "square", "black", "L", "20"],
2378          ["07", "triangle", "red", "L", "20"],
2379          ["08", "square", "black", "S", "10"],
2380          ["10", "square", "green", "L", "20"],
2381          ["11", "triangle", "red", "L", "20"],
2382          ["16", "square", "red", "S", "10"],
2383          ["18", "square", "red", "S", "20"],
2384          ["19", "square", "black", "L", "20"],
2385          ["22", "triangle", "red", "L", "30"],
2386          ["24", "square", "green", "L", "20"],
2387         ];
2388 
2389     string[][] data5x25ExpectedDistinctK2K3K4P20 =
2390         [["ID", "Shape", "Color", "Size", "Weight"],
2391          ["04", "circle", "green", "L", "30"],
2392          ["07", "triangle", "red", "L", "20"],
2393          ["09", "circle", "black", "S", "20"],
2394          ["11", "triangle", "red", "L", "20"],
2395          ["12", "circle", "green", "L", "30"],
2396          ["14", "circle", "green", "L", "30"],
2397          ["16", "square", "red", "S", "10"],
2398          ["18", "square", "red", "S", "20"],
2399          ["22", "triangle", "red", "L", "30"],
2400         ];
2401 
2402     /* Fields 2 and 4 from data5x25. Distinct sampling should select the same rows as data5x25 when equivalent keys are used. */
2403     string[][] data2x25 =
2404         [["Shape", "Size"],
2405          ["circle", "S"],
2406          ["circle", "L"],
2407          ["square", "L"],
2408          ["circle", "L"],
2409          ["ellipse", "S"],
2410          ["triangle", "S"],
2411          ["triangle", "L"],
2412          ["square", "S"],
2413          ["circle", "S"],
2414          ["square", "L"],
2415          ["triangle", "L"],
2416          ["circle", "L"],
2417          ["ellipse", "S"],
2418          ["circle", "L"],
2419          ["ellipse", "L"],
2420          ["square", "S"],
2421          ["circle", "L"],
2422          ["square", "S"],
2423          ["square", "L"],
2424          ["circle", "S"],
2425          ["ellipse", "L"],
2426          ["triangle", "L"],
2427          ["circle", "S"],
2428          ["square", "L"],
2429          ["circle", "S"],
2430         ];
2431 
2432     string fpath_data2x25 = buildPath(testDir, "data2x25.tsv");
2433     string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv");
2434     writeUnittestTsvFile(fpath_data2x25, data2x25);
2435     writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1..$]);
2436 
2437     string[][] data2x25ExpectedDistinctK1K2P20 =
2438         [["Shape", "Size"],
2439          ["square", "L"],
2440          ["triangle", "L"],
2441          ["square", "S"],
2442          ["square", "L"],
2443          ["triangle", "L"],
2444          ["square", "S"],
2445          ["square", "S"],
2446          ["square", "L"],
2447          ["triangle", "L"],
2448          ["square", "L"],
2449         ];
2450 
2451     string[][] data1x25 =
2452         [["Shape-Size"],
2453          ["circle-S"],
2454          ["circle-L"],
2455          ["square-L"],
2456          ["circle-L"],
2457          ["ellipse-S"],
2458          ["triangle-S"],
2459          ["triangle-L"],
2460          ["square-S"],
2461          ["circle-S"],
2462          ["square-L"],
2463          ["triangle-L"],
2464          ["circle-L"],
2465          ["ellipse-S"],
2466          ["circle-L"],
2467          ["ellipse-L"],
2468          ["square-S"],
2469          ["circle-L"],
2470          ["square-S"],
2471          ["square-L"],
2472          ["circle-S"],
2473          ["ellipse-L"],
2474          ["triangle-L"],
2475          ["circle-S"],
2476          ["square-L"],
2477          ["circle-S"],
2478         ];
2479 
2480     string fpath_data1x25 = buildPath(testDir, "data1x25.tsv");
2481     string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv");
2482     writeUnittestTsvFile(fpath_data1x25, data1x25);
2483     writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1..$]);
2484 
2485     string[][] data1x25ExpectedDistinctK1P20 =
2486         [["Shape-Size"],
2487          ["triangle-L"],
2488          ["square-S"],
2489          ["triangle-L"],
2490          ["ellipse-L"],
2491          ["square-S"],
2492          ["square-S"],
2493          ["ellipse-L"],
2494          ["triangle-L"],
2495         ];
2496 
2497     string[][] data1x25ExpectedDistinctK1P20Probs =
2498         [["random_value", "Shape-Size"],
2499          ["0", "triangle-L"],
2500          ["0", "square-S"],
2501          ["0", "triangle-L"],
2502          ["0", "ellipse-L"],
2503          ["0", "square-S"],
2504          ["0", "square-S"],
2505          ["0", "ellipse-L"],
2506          ["0", "triangle-L"],
2507         ];
2508 
2509     string[][] data1x25ExpectedDistinctK1P20ProbsInorder =
2510         [["random_value", "Shape-Size"],
2511          ["1", "circle-S"],
2512          ["4", "circle-L"],
2513          ["2", "square-L"],
2514          ["4", "circle-L"],
2515          ["2", "ellipse-S"],
2516          ["1", "triangle-S"],
2517          ["0", "triangle-L"],
2518          ["0", "square-S"],
2519          ["1", "circle-S"],
2520          ["2", "square-L"],
2521          ["0", "triangle-L"],
2522          ["4", "circle-L"],
2523          ["2", "ellipse-S"],
2524          ["4", "circle-L"],
2525          ["0", "ellipse-L"],
2526          ["0", "square-S"],
2527          ["4", "circle-L"],
2528          ["0", "square-S"],
2529          ["2", "square-L"],
2530          ["1", "circle-S"],
2531          ["0", "ellipse-L"],
2532          ["0", "triangle-L"],
2533          ["1", "circle-S"],
2534          ["2", "square-L"],
2535          ["1", "circle-S"],
2536         ];
2537 
2538     /*
2539      * Enough setup! Actually run some tests!
2540      */
2541 
2542     /* Permutations. Headers, static seed, compatibility mode. With weights and without. */
2543     testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty);
2544     testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0);
2545     testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1);
2546     testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat);
2547     testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat);
2548     testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat);
2549     testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
2550     testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
2551     testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
2552     testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2553     testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2554     testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
2555     testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
2556 
2557     /* Permutations, without compatibility mode, or with both compatibility and printing. */
2558     testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
2559     testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0);
2560     testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1);
2561     testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle);
2562     testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap);
2563     testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap);
2564     testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
2565     testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
2566     testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2567 
2568     /* Reservoir sampling using Algorithm R.
2569      * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.)
2570      */
2571     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
2572     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
2573     testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0);
2574     testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0);
2575     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1);
2576     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1);
2577     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6);
2578     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6);
2579     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum5);
2580     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum4);
2581     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum3);
2582     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum2);
2583     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum1);
2584 
2585     /* Bernoulli sampling cases. */
2586     testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty);
2587     testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0);
2588     testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1);
2589     testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6);
2590     testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6);
2591     testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2592     testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60);
2593     testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60);
2594     testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs);
2595 
2596     /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. */
2597     testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01);
2598     testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02);
2599     testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03);
2600     testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01);
2601     testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02);
2602     testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03);
2603     testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40);
2604 
2605     /* Distinct sampling cases. */
2606     testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
2607     testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
2608     testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1);
2609     testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6);
2610     testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
2611 
2612 
2613 
2614     /* Generating random weights. For uniform sampling, reuse the Bernoulli sampling test set at prob 100%.
2615      * For weighted sampling, reuse the weighted cases, but with the expected results in the original input order.
2616      */
2617     testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2618     testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2619     testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
2620                   data3x6ExpectedWt3ProbsInorder);
2621     testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
2622                   data3x6ExpectedWt3V41ProbsInorder);
2623     testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6],
2624                   data3x6ExpectedDistinctK1K3P60Probs);
2625     testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header",
2626                    "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom);
2627     testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
2628                   data3x6ExpectedDistinctK2P2ProbsInorder);
2629 
2630     /* Simple random sampling with replacement. */
2631     testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty);
2632     testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty);
2633     testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0);
2634     testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0);
2635     testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3);
2636     testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10);
2637     testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77);
2638 
2639     /* Permutations, compatibility mode, without headers. */
2640     testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1..$]);
2641     testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1..$]);
2642     testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1..$]);
2643     testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..$]);
2644     testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..$]);
2645     testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]);
2646     testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]);
2647     testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]);
2648     testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1..$]);
2649 
2650     /* Permutations, no headers, without compatibility mode, or with printing and compatibility mode. */
2651     testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1..$]);
2652     testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1..$]);
2653     testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1..$]);
2654     testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1..$]);
2655     testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]);
2656     testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]);
2657     testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]);
2658 
2659     /* Reservoir sampling using Algorithm R, no headers. NOTE: test labels repeat those of the with-header Algorithm R tests above. */
2660     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
2661     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
2662     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1..$]);
2663     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1..$]);
2664     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]);
2665     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]);
2666     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum5[1..$]);
2667     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum4[1..$]);
2668     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum3[1..$]);
2669     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum2[1..$]);
2670     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum1[1..$]);
2671 
2672     /* Bernoulli sampling cases. */
2673     testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1..$]);
2674     testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
2675     testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
2676     testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]);
2677     testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..$]);
2678     testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1..$]);
2679 
2680     /* Bernoulli sampling with probabilities in skip sampling range. */
2681     testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1..$]);
2682     testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1..$]);
2683     testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..$]);
2684     testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1..$]);
2685     testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1..$]);
2686     testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1..$]);
2687     testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1..$]);
2688 
2689     /* Distinct sampling cases. */
2690     testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]);
2691     testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2692     testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2693     testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2694 
2695     /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */
2696     testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]);
2697     testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1..$]);
2698     testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
2699                   data3x6ExpectedDistinctK1K3P60Probs[1..$]);
2700     testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
2701                   data3x6ExpectedDistinctK2P2ProbsInorder[1..$]);
2702 
2703     /* Simple random sampling with replacement. */
2704     testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
2705     testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
2706     testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1..$]);
2707     testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1..$]);
2708     testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1..$]);
2709 
2710     /* Multi-file tests. */
2711     testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode",
2712                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2713                   combo1ExpectedPermuteCompat);
2714     testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
2715                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2716                   combo1ExpectedPermuteCompatProbs);
2717     testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
2718                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2719                   combo1ExpectedPermuteWt3Probs);
2720     testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode",
2721                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2722                   combo1ExpectedPermuteWt3);
2723     testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4",
2724                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2725                   combo1ExpectedPermuteAlgoRNum4);
2726 
2727     /* Multi-file, no headers. */
2728     testTsvSample(["test-c6", "--static-seed", "--compatibility-mode",
2729                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2730                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2731                   combo1ExpectedPermuteCompat[1..$]);
2732     testTsvSample(["test-c7", "--static-seed", "--print-random",
2733                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2734                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2735                   combo1ExpectedPermuteCompatProbs[1..$]);
2736     testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3",
2737                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2738                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2739                   combo1ExpectedPermuteWt3Probs[1..$]);
2740     testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode",
2741                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2742                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2743                   combo1ExpectedPermuteWt3[1..$]);
2744     testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4",
2745                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2746                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2747                   combo1ExpectedPermuteAlgoRNum4[1..$]);
2748 
2749     /* Multi-file Bernoulli sampling cases, with and without headers. */
2750     testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5",
2751                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2752                   combo1ExpectedBernoulliCompatP50Probs);
2753     testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4",
2754                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2755                   combo1ExpectedBernoulliCompatP40);
2756     testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5",
2757                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2758                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2759                   combo1ExpectedBernoulliCompatP50Probs[1..$]);
2760     testTsvSample(["test-c14", "--static-seed", "--prob", ".4",
2761                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2762                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2763                   combo1ExpectedBernoulliCompatP40[1..$]);
2764 
2765     /* Bernoulli sampling with probabilities in skip sampling range. */
2766     testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03",
2767                    fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10],
2768                   combo2ExpectedBernoulliSkipV333P03);
2769     testTsvSample(["test-cc1", "-v", "333", "-p", "0.03",
2770                    fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader],
2771                   combo2ExpectedBernoulliSkipV333P03[1..$]);
2772 
2773     /* Distinct sampling cases. */
2774     testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
2775                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2776                   combo1ExpectedDistinctK1P40);
2777     testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4",
2778                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2779                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2780                   combo1ExpectedDistinctK1P40[1..$]);
2781 
2782     /* Generating random weights. */
2783     testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder",
2784                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2785                   combo1ExpectedProbsInorder);
2786     testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder",
2787                    fpath_data3x3_noheader, fpath_data3x1_noheader,
2788                    fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
2789                   combo1ExpectedProbsInorder[1..$]);
2790 
2791     /* Simple random sampling with replacement. */
2792     testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10",
2793                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2794                   combo1ExpectedReplaceNum10);
2795 
2796     testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10",
2797                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2798                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2799                   combo1ExpectedReplaceNum10[1..$]);
2800 
2801     /* Single column file. */
2802     testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
2803     testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
2804 
2805     /* Distributions. */
2806     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
2807     testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs);
2808     testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs);
2809     testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs);
2810     testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs);
2811 
2812     /* Tests of the subset sample size option (--n|num).
2813      *
2814      * Note: The way these tests are done ensures that subset length does not affect
2815      * output order.
2816      */
2817     import std.algorithm : min;
2818     for (size_t n = data3x6.length + 2; n >= 1; n--)
2819     {
2820         /* reservoirSamplingViaHeap.
2821          */
2822         size_t expectedLength = min(data3x6.length, n + 1);
2823         testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
2824                        "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
2825 
2826         testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
2827                        "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
2828 
2829         testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
2830                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]);
2831 
2832         testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
2833                        "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]);
2834 
2835         testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
2836                        "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]);
2837 
2838         testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
2839                        fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]);
2840 
2841         testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
2842                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]);
2843 
2844         testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
2845                        "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]);
2846 
2847         testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string,
2848                        "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]);
2849 
2850         /* Bernoulli sampling.
2851          */
2852         import std.algorithm : min;
2853         size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length);
2854 
2855         testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2856                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]);
2857 
2858         testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2859                        "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]);
2860 
2861         testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2862                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]);
2863 
2864         testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2865                        fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]);
2866 
2867         /* Distinct Sampling.
2868          */
2869         size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length);
2870 
2871         testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
2872                        "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]);
2873 
2874         testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
2875                        fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]);
2876 
2877         testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
2878                        "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]);
2879 
2880         testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
2881                        fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
2882     }
2883 
2884     /* Similar tests with the 1x10 data set. */
2885     for (size_t n = data1x10.length + 2; n >= 1; n--)
2886     {
2887         size_t expectedLength = min(data1x10.length, n + 1);
2888         testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
2889                        "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);
2890 
2891         testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
2892                        "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);
2893 
2894         testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
2895                        fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);
2896 
2897         testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
2898                        "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
2899     }
2900 
2901     /* Simple random sampling with replacement: ensure sample size doesn't change order. */
2902     for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
2903     {
2904         testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
2905                       data3x6ExpectedReplaceNum10[0 .. n + 1]);
2906 
2907         testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
2908                       data3x6ExpectedReplaceNum10[1 .. n + 1]);
2909     }
2910 
2911     /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */
2912     for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
2913     {
2914         size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);
2915 
2916         testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
2917                        "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);
2918 
2919         testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
2920                        fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
2921 }
2922 
2923 
2924     /* Distinct sampling tests. */
2925     testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
2926                   data5x25ExpectedDistinctK2P40);
2927 
2928     testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
2929                   data5x25ExpectedDistinctK2K4P20);
2930 
2931     testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
2932                   data5x25ExpectedDistinctK2K3K4P20);
2933 
2934     testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
2935                   data5x25ExpectedDistinctK2P40[1..$]);
2936 
2937     testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
2938                   data5x25ExpectedDistinctK2K4P20[1..$]);
2939 
2940     testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
2941                   data5x25ExpectedDistinctK2K3K4P20[1..$]);
2942 
2943 
2944     /* These distinct tests check that keying on the whole line ('-k 0') and listing all
2945      * fields in order give the same result. They also check that field numbers don't
2946      * matter: '-k 1,2' in data2x25 selects the same keys as '-k 2,4' in data5x25.
2947      */
2948     testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25],
2949                   data2x25ExpectedDistinctK1K2P20);
2950 
2951     testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25],
2952                   data2x25ExpectedDistinctK1K2P20);
2953 
2954     testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader],
2955                   data2x25ExpectedDistinctK1K2P20[1..$]);
2956 
2957     testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader],
2958                   data2x25ExpectedDistinctK1K2P20[1..$]);
2959 
2960     /* Similar to the last set, but for a 1-column file. Also with random value printing. */
2961     testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25],
2962                   data1x25ExpectedDistinctK1P20);
2963 
2964     testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25],
2965                   data1x25ExpectedDistinctK1P20);
2966 
2967     testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader],
2968                   data1x25ExpectedDistinctK1P20[1..$]);
2969 
2970     testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader],
2971                   data1x25ExpectedDistinctK1P20[1..$]);
2972 
2973 
2974     testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25],
2975                   data1x25ExpectedDistinctK1P20Probs);
2976 
2977     testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25],
2978                   data1x25ExpectedDistinctK1P20Probs);
2979 
2980     testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader],
2981                   data1x25ExpectedDistinctK1P20Probs[1..$]);
2982 
2983     testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader],
2984                   data1x25ExpectedDistinctK1P20Probs[1..$]);
2985 
2986 
2987     testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25],
2988                   data1x25ExpectedDistinctK1P20ProbsInorder);
2989 
2990     testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25],
2991                   data1x25ExpectedDistinctK1P20ProbsInorder);
2992 
2993     testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader],
2994                   data1x25ExpectedDistinctK1P20ProbsInorder[1..$]);
2995 
2996     testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader],
2997                   data1x25ExpectedDistinctK1P20ProbsInorder[1..$]);
2998 
2999 }