1 /**
2 Command line tool for randomizing or sampling lines from input streams. Several
3 sampling methods are available, including simple random sampling, weighted random
4 sampling, Bernoulli sampling, and distinct sampling.
5 
6 Copyright (c) 2017-2019, eBay Software Foundation
7 Initially written by Jon Degenhardt
8 
9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 module tsv_utils.tsv_sample;
12 
13 import std.range;
14 import std.stdio;
15 import std.typecons : tuple, Flag;
16 
/* Embed a D runtime option in the executable: "gcopt=cleanup:none". The declaration
 * is only emitted when the compiler front-end version is 2.085 or later.
 */
static if (__VERSION__ >= 2085)
{
    extern(C) __gshared string[] rt_options = ["gcopt=cleanup:none"];
}
18 
19 version(unittest)
20 {
21     // When running unit tests, use main from -main compiler switch.
22 }
23 else
24 {
25     /** Main program.
26      *
27      * Invokes command line argument processing and calls tsvSample to do the real
28      * work. Errors occurring during processing are caught and reported to the user.
29      */
30     int main(string[] cmdArgs)
31     {
32         /* When running in DMD code coverage mode, turn on report merging. */
33         version(D_Coverage) version(DigitalMars)
34         {
35             import core.runtime : dmd_coverSetMerge;
36             dmd_coverSetMerge(true);
37         }
38 
39         TsvSampleOptions cmdopt;
40         const r = cmdopt.processArgs(cmdArgs);
41         if (!r[0]) return r[1];
42         version(LDC_Profile)
43         {
44             import ldc.profile : resetAll;
45             resetAll();
46         }
47         try
48         {
49             import tsv_utils.common.utils : BufferedOutputRange;
50             auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
51 
52             tsvSample(cmdopt, bufferedOutput);
53         }
54         catch (Exception exc)
55         {
56             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
57             return 1;
58         }
59         return 0;
60     }
61 }
62 
/** Brief help text. processArgs passes this to defaultGetoptPrinter, which prints it
 * ahead of the option list, when standard help is requested.
 */
immutable helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

The '--n|num' option limits the sample size produced. It speeds up line
order randomization and weighted sampling significantly. It is also used
to terminate sampling with replacement.

Use '--help-verbose' for detailed information.

Options:
EOS";
93 
/** Detailed help text. processArgs passes this to defaultGetoptPrinter, which prints
 * it ahead of the option list, when --help-verbose is given.
 */
immutable helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up line order randomization and weighted sampling significantly
(details below). It is also used to terminate sampling with replacement.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, so there is no memory accumulation. These
algorithms support arbitrary size inputs. Sampling with replacement reads
all lines into memory and is limited by available memory. The line order
randomization algorithms hold the full output set in memory prior to
generating results. This ultimately limits the size of the output set. For
these, memory needs can be reduced by using a sample size (--n|num). This
engages reservoir sampling. Output order is not affected. Both
'tsv-sample -n 1000' and 'tsv-sample | head -n 1000' produce the same
results, but the former is quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";
187 
188 /** Container for command line options and derived data.
189  *
190  * TsvSampleOptions handles several aspects of command line options. On the input side,
191  * it defines the command line options available, performs validation, and sets up any
192  * derived state based on the options provided. These activities are handled by the
193  * processArgs() member.
194  *
195  * Once argument processing is complete, the TsvSampleOptions is used as a container
196  * holding the specific processing options used by the different sampling routines.
197  */
struct TsvSampleOptions
{
    string programName;                        /// Program name
    string[] files;                            /// Input files
    bool helpVerbose = false;                  /// --help-verbose
    bool hasHeader = false;                    /// --H|header
    size_t sampleSize = 0;                     /// --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
    size_t[] keyFields;                        /// --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    /// --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           /// --r|replace
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    bool printRandom = false;                  /// --print-random
    bool genRandomInorder = false;             /// --gen-random-inorder
    string randomValueHeader = "random_value"; /// --random-value-header
    bool compatibilityMode = false;            /// --compatibility-mode
    char delim = '\t';                         /// --d|delimiter
    bool versionWanted = false;                /// --V|version
    bool preferSkipSampling = false;           /// --prefer-skip-sampling
    bool preferAlgorithmR = false;             /// --prefer-algorithm-r
    bool hasWeightField = false;               /// Derived.
    bool useBernoulliSampling = false;         /// Derived.
    bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
    bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
    uint seed = 0;                             /// Derived from --static-seed, --seed-value

    /** Process tsv-sample command line arguments.
     *
     * Defines the command line options, performs validation, and derives additional
     * state. std.getopt.getopt is called to do the main option processing followed
     * by additional validation and derivation.
     *
     * Help text is printed to standard output if help was requested. Error text is
     * written to stderr if invalid input is encountered.
     *
     * A tuple is returned. First value is true if command line arguments were
     * successfully processed and execution should continue, or false if an error
     * occurred or the user asked for help. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. Field indices will have been converted to zero-based.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first. On success the
     *             remaining (non-option) arguments are moved to `files` and cmdArgs
     *             is truncated to its first element.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : any, canFind, each;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected for output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                "k|key-fields",    "<field-list>  Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "w|weight-field",  "NUM  Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
                "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header",  "     Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
                "compatibility-mode", "     Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,

                "d|delimiter",     "CHR  Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "prefer-skip-sampling", "     (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
                &preferSkipSampling,

                "prefer-algorithm-r",   "     (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
                &preferAlgorithmR,
                );

            /* Informational requests: print and stop with a success exit code. */
            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            /* Sampling with replacement is exclusive of the other sampling modes. */
            if (srsWithReplacement)
            {
                if (hasWeightField)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");
                }
                else if (!inclusionProbability.isNaN)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");
                }
                else if (keyFields.length > 0)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");
                }
                else if (printRandom || genRandomInorder)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");
                }
            }

            if (keyFields.length > 0)
            {
                /* Note: useDistinctSampling is set as part of the inclusion probability checks below. */

                if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields.");

                if (keyFields.length == 1 && keyFields[0] == 0)
                {
                    /* A lone field number zero means the whole line is the key. */
                    distinctKeyIsFullLine = true;
                }
                else
                {
                    if (keyFields.length > 1 && keyFields.any!(x => x == 0))
                    {
                        throw new Exception("Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");
                    }

                    keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
                }
            }

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */
            if (!inclusionProbability.isNaN)
            {
                if (inclusionProbability <= 0.0 || inclusionProbability > 1.0)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));
                }

                if (keyFields.length > 0) useDistinctSampling = true;
                else useBernoulliSampling = true;

                if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together.");

                if (genRandomInorder && !useDistinctSampling)
                {
                    throw new Exception("--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used.");
                }
            }
            else if (genRandomInorder && !hasWeightField)
            {
                /* No probability and no weights: random values are generated by the
                 * Bernoulli sampling routine.
                 */
                useBernoulliSampling = true;
            }

            if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') ||
                randomValueHeader.canFind(delim))
            {
                throw new Exception("--random-value-header must be at least one character and not contain field delimiters or newlines.");
            }

            /* Random value printing implies compatibility-mode, otherwise user's selection is used. */
            if (printRandom || genRandomInorder) compatibilityMode = true;

            /* Seed. */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            /* An explicit non-zero --seed-value wins over the fixed --static-seed value. */
            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            /* Assume remaining args are files. Use standard input if files were not provided. */
            files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"];
            cmdArgs.length = 1;
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
/** Top-level dispatcher for tsv-sample.
 *
 * Examines the derived flags in cmdopt and forwards to the one sampling routine
 * that implements the requested mode. The modes are tested in a fixed order:
 * sampling with replacement, Bernoulli sampling, distinct sampling, in-order
 * weighted random value generation, reservoir sampling (a sample size was given),
 * and finally full line order randomization.
 */
void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.srsWithReplacement)
    {
        simpleRandomSamplingWithReplacement(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useBernoulliSampling)
    {
        bernoulliSamplingCommand(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useDistinctSampling)
    {
        if (!cmdopt.genRandomInorder) distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
        else distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        return;
    }

    if (cmdopt.genRandomInorder)
    {
        /* Bernoulli and distinct sampling deal with gen-random-inorder on their own,
         * and sampling with replacement does not support it, so reaching this point
         * means a weight field is in use.
         */
        assert(cmdopt.hasWeightField);
        generateWeightedRandomValuesInorder(cmdopt, outputStream);
        return;
    }

    if (cmdopt.sampleSize == 0) randomizeLinesCommand(cmdopt, outputStream);
    else reservoirSamplingCommand(cmdopt, outputStream);
}
456 
/** Chooses and runs the Bernoulli sampling implementation for the given options.
 *
 * Two implementations exist: the per-line ("vanilla") algorithm and skip sampling.
 * Skip sampling is a bit faster for small inclusion probabilities but cannot be used
 * in compatibility mode. It is picked only when compatibility mode is off and either
 * the inclusion probability is not above skipSamplingProbabilityThreshold or skip
 * sampling was explicitly preferred. See the bernoulliSkipSampling documentation for
 * background on the threshold.
 */
void bernoulliSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(!cmdopt.hasWeightField);

    immutable double skipSamplingProbabilityThreshold = 0.04;

    /* Kept as a '>' test so a NaN probability (no --prob given) compares false. */
    immutable bool aboveThreshold = cmdopt.inclusionProbability > skipSamplingProbabilityThreshold;
    immutable bool useSkipSampling =
        !cmdopt.compatibilityMode && (cmdopt.preferSkipSampling || !aboveThreshold);

    if (useSkipSampling)
    {
        bernoulliSkipSampling(cmdopt, outputStream);
        return;
    }

    if (!cmdopt.genRandomInorder) bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
    else bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
}
492 
/** Per-line Bernoulli sampling of the input files.
 *
 * A uniform random value is drawn for every data line. In the normal mode
 * (No.generateRandomAll) a line is written when its value is less than
 * cmdopt.inclusionProbability, optionally prefixed with that value when
 * cmdopt.printRandom is set. In Yes.generateRandomAll mode every line is written,
 * always prefixed with its random value. Input order is preserved in both modes.
 *
 * With cmdopt.hasHeader, the first line of the first file is written once as the
 * header (prefixed with cmdopt.randomValueHeader when random values are printed) and
 * the first line of every later file is dropped. Output stops as soon as
 * cmdopt.sampleSize lines have been written, if it is non-zero.
 */
void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    auto rng = Random(cmdopt.seed);

    /* Random values are always printed in generateRandomAll mode, otherwise on request. */
    immutable bool prependRandomValue = generateRandomAll || cmdopt.printRandom;

    bool headerDone = false;
    size_t linesOut = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Only the first header seen is written. */
                if (headerDone) continue;

                if (prependRandomValue)
                {
                    outputStream.put(cmdopt.randomValueHeader);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");
                headerDone = true;
                continue;
            }

            /* A value is drawn for every data line, selected or not. */
            immutable double lineScore = uniform01(rng);

            static if (!generateRandomAll)
            {
                if (!(lineScore < cmdopt.inclusionProbability)) continue;
            }

            if (prependRandomValue)
            {
                outputStream.formatRandomValue(lineScore);
                outputStream.put(cmdopt.delim);
            }
            outputStream.put(line);
            outputStream.put("\n");

            if (cmdopt.sampleSize != 0)
            {
                ++linesOut;
                if (linesOut == cmdopt.sampleSize) return;
            }
        }
    }
}
578 
/** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips.
 *
 * Skip sampling works by skipping a random number of lines between selections. This
 * can be faster than assigning a random value to each line when the inclusion
 * probability is low, as it reduces the number of calls to the random number
 * generator. Both the random number generator and the log() function are called when
 * calculating the next skip size. These additional log() calls add up as the
 * inclusion probability increases.
 *
 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
 * file-oriented line sampling. This is obviously environment specific. In the
 * environments this implementation has been tested in the performance improvements
 * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
 *
 * The algorithm does not assign random values to individual lines. This makes it
 * incompatible with random value printing. It is not suitable for compatibility mode
 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
 * does not have this property.
 *
 * The algorithm for calculating the skip size has been described by multiple sources.
 * There are two key variants depending on whether the total number of lines in the
 * data set is known in advance. (This implementation does not know the total.)
 * Useful references:
 * $(LIST
 *     * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
 *       ACM Trans on Mathematical Software, 1987. On-line:
 *       http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
 *     * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
 *       "Data Stream Management", Springer-Verlag, 2016. On-line:
 *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
 *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
 *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
 * )
 *
 * Params:
 *   cmdopt       = Processed command line options. The inclusion probability must be
 *                  strictly between 0.0 and 1.0; random value printing and
 *                  compatibility mode must be off.
 *   outputStream = Destination for the header and the selected lines. Taken by
 *                  `auto ref`, like the other sampling routines, so an lvalue
 *                  argument is written to directly rather than through a copy.
 */
void bernoulliSkipSampling(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
    if (isOutputRange!(OutputRange, char))
{
    import std.conv : to;
    import std.math : log, trunc;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.compatibilityMode);

    auto randomGenerator = Random(cmdopt.seed);

    immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
    immutable double logDiscardRate = log(discardRate);

    /* Draws the number of lines to skip before the next selected line.
     *
     * Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed
     * interval to (0.0, 1.0], excluding 0.0.
     */
    size_t nextSkipCount()
    {
        return (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
    }

    size_t remainingSkips = nextSkipCount();

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Header lines never count against the skips; only the first is written. */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else if (remainingSkips > 0)
            {
                --remainingSkips;
            }
            else
            {
                outputStream.put(line);
                outputStream.put("\n");

                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }

                remainingSkips = nextSkipCount();
            }
        }
    }
}
674 
675 /** Sample a subset of lines by choosing a random set of values from key fields.
676  *
677  * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling.
678  * However, instead of each line being subject to an independent trial, lines are
679  * selected based on a key from each line. A portion of keys are randomly selected for
680  * output, and every line containing a selected key is included in the output.
681  *
682  * An example use-case is a query log having <user, query, clicked-url> triples. It is
 * often useful to sample records for a portion of the users, but including all records
684  * for the users selected. Distinct sampling supports this by selecting the subset of
685  * users included in the output.
686  *
687  * Distinct sampling is done by hashing the key and mapping the hash value into
688  * buckets matching the inclusion probability. Records having a key mapping to bucket
689  * zero are output.
690  */
void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, throwIfWindowsNewlineOnUnix;

    /* The compile-time flag must agree with the command line option. */
    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);

    static if (generateRandomAll)
    {
        import std.format : formatValue, singleSpec;
        immutable randomValueFormatSpec = singleSpec("%d");
    }

    /* Separator hashed between the individual fields of a multi-field key. */
    immutable ubyte[1] delimArray = [cmdopt.delim];

    /* Lines whose key hashes to bucket zero are the ones selected. */
    uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint;

    /* Collector for the key fields. Not needed when the whole line is the key. */
    auto keyCollector = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);

    bool headerPending = true;
    size_t linesEmitted = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the first header seen is written. */
                    if (headerPending)
                    {
                        if (generateRandomAll || cmdopt.printRandom)
                        {
                            outputStream.put(cmdopt.randomValueHeader);
                            outputStream.put(cmdopt.delim);
                        }

                        outputStream.put(line);
                        outputStream.put("\n");
                        headerPending = false;
                    }
                    continue;
                }
            }

            /* Murmurhash is fed the key piece by piece and then finalized. The
             * full-line key and the field-based key are handled separately.
             */
            auto hasher = MurmurHash3!32(cmdopt.seed);

            if (!cmdopt.distinctKeyIsFullLine)
            {
                assert(keyCollector !is null);

                /* Walk the fields until every key field has been captured. */
                keyCollector.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    keyCollector.processNextField(fieldIndex, fieldValue);
                    if (keyCollector.allFieldsFilled) break;
                }

                if (!keyCollector.allFieldsFilled)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               (filename == "-") ? "Standard Input" : filename, fileLineNum));
                }

                foreach (keyNum, keyValue; keyCollector.outputFields.enumerate)
                {
                    if (keyNum > 0) hasher.put(delimArray);
                    hasher.put(cast(ubyte[]) keyValue);
                }
            }
            else
            {
                hasher.put(cast(ubyte[]) line);
            }

            hasher.finish;
            auto bucket = hasher.get % numBuckets;

            static if (generateRandomAll)
            {
                /* Every line is output, prefixed by its bucket number. */
                outputStream.formatValue(bucket, randomValueFormatSpec);
                outputStream.put(cmdopt.delim);
            }
            else
            {
                /* Only bucket zero is output. */
                if (bucket != 0) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put('0');
                    outputStream.put(cmdopt.delim);
                }
            }

            outputStream.put(line);
            outputStream.put("\n");

            /* A sample size of zero means no limit on the number of lines output. */
            if (cmdopt.sampleSize != 0 && ++linesEmitted == cmdopt.sampleSize) return;
        }
    }
}
824 
825 /** Invokes the appropriate reservoir sampling routine based on the command line
826  * arguments.
827  *
828  * This routine selects the appropriate reservoir sampling function and template
829  * instantiation to use based on the command line arguments.
830  *
831  * Reservoir sampling is used when a fixed size sample is being selected from an
 * input stream. Weighted and unweighted sampling are supported. These routines also
833  * randomize the order of the selected lines. This is consistent with line order
834  * randomization of the entire input stream (handled by randomizeLinesCommand).
835  *
836  * For unweighted sampling there is a performance tradeoff between the two available
837  * implementations. Heap-based sampling is faster for small sample sizes, Algorithm R
838  * is faster for large sample sizes. The threshold used here was chosen based on
839  * performance tests. See the reservoirSamplingAlgorithmR documentation for more
840  * information.
841  */
842 
void reservoirSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(cmdopt.sampleSize != 0);

    /* Sample sizes at or above this value are routed to Algorithm R. */
    enum size_t algorithmRSampleSizeThreshold = 128 * 1024;

    /* Weighted sampling is only available via the heap-based routine. */
    if (cmdopt.hasWeightField)
    {
        reservoirSamplingViaHeap!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    /* Algorithm R is chosen for large samples or on explicit request, but never
     * in compatibility mode.
     */
    immutable bool algorithmRWanted =
        cmdopt.preferAlgorithmR || cmdopt.sampleSize >= algorithmRSampleSizeThreshold;

    if (!cmdopt.compatibilityMode && algorithmRWanted)
    {
        reservoirSamplingAlgorithmR(cmdopt, outputStream);
    }
    else
    {
        reservoirSamplingViaHeap!(No.isWeighted)(cmdopt, outputStream);
    }
}
864 
865 /** Reservoir sampling using a heap. Both weighted and unweighted random sampling are
866  * supported.
867  *
868  * The algorithm used here is based on the one-pass algorithm described by Pavlos
869  * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
870  * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are
871  * simply set to one.
872  *
873  * The implementation uses a heap (priority queue) large enough to hold the desired
874  * number of lines. Input is read line-by-line, assigned a random value, and added to
875  * the heap. The role of the heap is to identify the lines with the highest assigned
876  * random values. Once the heap is full, adding a new line means dropping the line
 * with the lowest score. A "min" heap is used for this reason.
878  *
879  * When done reading all lines, the "min" heap is in the opposite order needed for
 * output. The desired order is obtained by removing each element one at a time from
881  * the heap. The underlying data store will have the elements in correct order.
882  *
883  * Generating output in weighted order matters for several reasons:
884  *  - For weighted sampling, it preserves the property that smaller valid subsets can be
885  *    created by taking the first N lines.
886  *  - For unweighted sampling, it ensures that all output permutations are possible, and
 *    are not influenced by input order or the heap data structure used.
888  *  - Order consistency is maintained when making repeated use of the same random seed,
889  *    but with different sample sizes.
890  *
891  * There are use cases where only the selection set matters, for these some performance
892  * could be gained by skipping the reordering and simply printing the backing store
893  * array in-order, but making this distinction seems an unnecessary complication.
894  *
895  * Notes:
896  * $(LIST
897  *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
898  *      randomization of all input lines. This was dropped in version 1.2.2 in favor
899  *      of the approach used in randomizeLines. The latter has significant advantages
900  *      given that all data must be read into memory.
901  *    * For large reservoir sizes better performance can be achieved using Algorithm R.
902  *      See the reservoirSamplingAlgorithmR documentation for details.
903  * )
904  */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.container.array;
    import std.container.binaryheap;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    /* The compile-time flag must agree with the command line option. */
    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    auto rng = Random(cmdopt.seed);

    /* A candidate line together with the random score assigned to it. */
    struct Entry
    {
        double score;
        char[] line;
    }

    /* The heap sits on top of an std.container.array backing store. Removing every
     * element from the heap leaves that store in reversed heap order, which is
     * relied on for output below. The Phobos binaryheap does not support this for
     * all backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */
    Array!Entry dataStore;
    dataStore.reserve(cmdopt.sampleSize);
    auto reservoir = dataStore.heapify!("a.score > b.score")(0);  // Min binaryheap

    bool headerPending = true;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the first header seen is written. */
                    if (headerPending)
                    {
                        if (cmdopt.printRandom)
                        {
                            outputStream.put(cmdopt.randomValueHeader);
                            outputStream.put(cmdopt.delim);
                        }
                        outputStream.put(line);
                        outputStream.put("\n");
                        headerPending = false;
                    }
                    continue;
                }
            }

            /* Assign the score. A random value is drawn for every unweighted line,
             * and for weighted lines only when the weight is positive.
             */
            static if (isWeighted)
            {
                immutable double weight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);
                immutable double score =
                    (weight > 0.0)
                    ? uniform01(rng) ^^ (1.0 / weight)
                    : 0.0;
            }
            else
            {
                immutable double score = uniform01(rng);
            }

            /* Once the reservoir is full a line gets in only by beating the
             * current lowest score, which it then displaces.
             */
            if (reservoir.length >= cmdopt.sampleSize)
            {
                if (reservoir.front.score < score)
                {
                    reservoir.replaceFront(Entry(score, line.dup));
                }
            }
            else
            {
                reservoir.insert(Entry(score, line.dup));
            }
        }
    }

    /* Drain the heap so the backing store ends up in output order. The asserts
     * check that the backing store length stays synchronized with the reservoir.
     */
    immutable size_t numLines = reservoir.length;
    assert(numLines == dataStore.length);

    while (!reservoir.empty) reservoir.removeFront;
    assert(numLines == dataStore.length);

    immutable bool withRandomValue = cmdopt.printRandom;
    foreach (entry; dataStore)
    {
        if (withRandomValue)
        {
            outputStream.formatRandomValue(entry.score);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.line);
        outputStream.put("\n");
    }
}
1015 
1016 /** Generates weighted random values for all input lines, preserving input order.
1017  *
1018  * This complements weighted reservoir sampling, but instead of using a reservoir it
1019  * simply iterates over the input lines generating the values. The weighted random
1020  * values are generated with the same formula used by reservoirSampling.
1021  */
void generateWeightedRandomValuesInorder(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.hasWeightField);

    auto rng = Random(cmdopt.seed);

    bool headerPending = true;
    size_t linesEmitted = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the first header seen is written, prefixed by the
                     * random value column header.
                     */
                    if (headerPending)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                        outputStream.put(line);
                        outputStream.put("\n");
                        headerPending = false;
                    }
                    continue;
                }
            }

            immutable double weight =
                getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);

            /* A random value is drawn only for lines having a positive weight. */
            immutable double score =
                (weight > 0.0)
                ? uniform01(rng) ^^ (1.0 / weight)
                : 0.0;

            outputStream.formatRandomValue(score);
            outputStream.put(cmdopt.delim);
            outputStream.put(line);
            outputStream.put("\n");

            /* A sample size of zero means no limit on the number of lines output. */
            if (cmdopt.sampleSize != 0 && ++linesEmitted == cmdopt.sampleSize) return;
        }
    }
}
1076 
1077 /** Reservoir sampling via Algorithm R
1078  *
1079  * This is an implementation of reservoir sampling using what is commonly known as
1080  * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of
1081  * Computer Programming, Volume 2: Seminumerical Algorithms". More information about
1082  * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with
1083  * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
1084  * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
1085  *
1086  * Algorithm R is used for unweighted sampling without replacement. The heap-based
1087  * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
1088  *
1089  * The classic algorithm stops after identifying the selected set of items. This
1090  * implementation goes one step further and randomizes the order of the selected
1091  * lines. This supports the tsv-sample use-case, which is line order randomization.
1092  *
1093  * This algorithm is faster than reservoirSamplingViaHeap when the sample size
1094  * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
1095  * Insertion in this algorithm is O(1). Similarly, generating the random order in the
1096  * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
1097  *
1098  * This speed advantage may be offset a certain amount by using a more expensive random
1099  * value generator. reservoirSamplingViaHeap generates values between zero and one,
 * whereas reservoirSamplingAlgorithmR generates random integers over an ever growing
1101  * interval. The latter is expected to be more expensive. This is consistent with
1102  * performance tests indicating that reservoirSamplingViaHeap is faster when using
1103  * small-to-medium size reservoirs and large input streams.
1104  */
void reservoirSamplingAlgorithmR(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, randomShuffle, uniform;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.sampleSize > 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.compatibilityMode);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    string[] reservoir;
    auto reservoirAppender = appender(&reservoir);
    reservoirAppender.reserve(cmdopt.sampleSize);

    auto randomGenerator = Random(cmdopt.seed);

    /* Process each line. */

    bool headerWritten = false;
    size_t totalLineNum = 0;    // Number of data lines seen before the current line.
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the header from the first file is written. */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                /* Add lines to the reservoir until the reservoir is filled.
                 * After that lines are added with decreasing likelihood, based on
                 * the total number of lines seen. If added to the reservoir, the
                 * line replaces a randomly chosen existing line.
                 */
                if (totalLineNum < cmdopt.sampleSize)
                {
                    reservoirAppender ~= line.idup;
                }
                else
                {
                    /* The current line is data line number (totalLineNum + 1), so
                     * the slot must be drawn from the totalLineNum + 1 values in
                     * [0, totalLineNum]. 'uniform' excludes its upper bound, hence
                     * the '+ 1'. Drawing from [0, totalLineNum) instead would keep
                     * the line with probability k/totalLineNum rather than
                     * k/(totalLineNum + 1), and would always keep the first line
                     * read after the reservoir fills.
                     */
                    immutable size_t i = uniform(0, totalLineNum + 1, randomGenerator);
                    if (i < reservoir.length) reservoir[i] = line.idup;
                }

                ++totalLineNum;
            }
        }
    }

    /* The random sample is now in the reservoir. Shuffle it and print. */

    reservoir.randomShuffle(randomGenerator);

    foreach (ref line; reservoir)
    {
        outputStream.put(line);
        outputStream.put("\n");
    }
}
1174 
1175 /** This routine is invoked when all input lines are being randomized. It selects the
1176  * appropriate function and template instantiation based on the command line arguments.
1177  *
1178  * Different randomization algorithms are used when all input lines are being randomized
1179  * rather than a subset. The key distinction being that if all input needs to be read
1180  * into memory to support the algorithm, it works better to simply read the data all at
1181  * once.
1182  *
1183  * There are two different types of algorithms used. Array shuffling is used for
1184  * unweighted randomization. Sorting is used for weighted randomization or when
1185  * compatibility mode is needed.
1186  *
1187  * The algorithms used here are all limited by available memory.
1188  */
void randomizeLinesCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    /* Shuffling handles the plain unweighted case. Weighted randomization and
     * compatibility mode both go through the sort-based routine.
     */
    immutable bool sortRequired = cmdopt.hasWeightField || cmdopt.compatibilityMode;

    if (!sortRequired)
    {
        randomizeLinesViaShuffle(cmdopt, outputStream);
        return;
    }

    if (cmdopt.hasWeightField) randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
    else randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
}
1205 
1206 /** Randomize all the lines in files or standard input using assigned random weights
1207  * and sorting.
1208  *
1209  * All lines in files and/or standard input are read in and written out in random
1210  * order. This algorithm assigns a random value to each line and sorts. This approach
1211  * supports both weighted sampling and simple random sampling (unweighted).
1212  *
1213  * This is significantly faster than heap-based reservoir sampling in the case where
1214  * the entire file is being read. See also randomizeLinesViaShuffle for the unweighted
 * case, as it is a little faster, at the cost of not supporting random value printing or
1216  * compatibility-mode.
1217  *
1218  * Input data size is limited by available memory. Disk oriented techniques are needed
1219  * when data sizes are larger. For example, generating random values line-by-line (ala
1220  * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
1221  */
void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map, sort;

    /* The compile-time flag must agree with the command line option. */
    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize == 0);

    /* Load every file, then split the data into lines, each carrying a random
     * value. identifyFileLines also writes the first header line.
     */
    const fileData = cmdopt.files.map!FileData.array;
    auto scoredLines = identifyFileLines!(Yes.hasRandomValue, isWeighted)(fileData, cmdopt, outputStream);

    /* Order the lines from highest random value to lowest. */
    sort!((x, y) => x.randomValue > y.randomValue)(scoredLines);

    immutable bool withRandomValue = cmdopt.printRandom;
    foreach (entry; scoredLines)
    {
        if (withRandomValue)
        {
            outputStream.formatRandomValue(entry.randomValue);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.data);
        outputStream.put("\n");
    }
}
1255 
1256 /** Randomize all the lines in files or standard input using a shuffling algorithm.
1257  *
1258  * All lines in files and/or standard input are read in and written out in random
1259  * order. This routine uses array shuffling, which is faster than sorting. It is a
1260  * good alternative to randomizeLinesViaSort when doing unweighted randomization.
1261  *
1262  * Input data size is limited by available memory. Disk oriented techniques are needed
1263  * when data sizes are larger. For example, generating random values line-by-line (ala
1264  * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
1265  *
1266  * This routine does not support random value printing or compatibility-mode.
1267  */
void randomizeLinesViaShuffle(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, randomShuffle;

    assert(cmdopt.sampleSize == 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    /* Load every file and split the data into lines. */
    const fileData = cmdopt.files.map!FileData.array;
    auto shuffledLines = identifyFileLines!(No.hasRandomValue, No.isWeighted)(fileData, cmdopt, outputStream);

    /* Shuffle in place, then write the lines out in their new order.
     *
     * Note: Also tried randomCover, but that was exceedingly slow.
     */
    auto rng = Random(cmdopt.seed);
    randomShuffle(shuffledLines, rng);

    foreach (ref entry; shuffledLines)
    {
        outputStream.put(entry.data);
        outputStream.put("\n");
    }
}
1301 
1302 /** Simple random sampling with replacement.
1303  *
1304  * All lines in files and/or standard input are read in. Then random lines are selected
1305  * one at a time and output. Lines can be selected multiple times. This process continues
1306  * until the desired number of samples (--n|num) has been output. Output continues
1307  * indefinitely if a sample size was not provided.
1308  */
void simpleRandomSamplingWithReplacement(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, uniform;

    /* Load every file and split the data into lines. */
    const fileData = cmdopt.files.map!FileData.array;
    const inputLines = fileData.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);

    /* Nothing to sample from. */
    if (inputLines.length == 0) return;

    auto rng = Random(cmdopt.seed);

    /* A sample size of zero means the loop never ends on its own. Otherwise
     * exactly sampleSize lines are written.
     */
    immutable bool unbounded = (cmdopt.sampleSize == 0);

    for (size_t numWritten = 0; unbounded || numWritten < cmdopt.sampleSize; )
    {
        immutable size_t index = uniform(0, inputLines.length, rng);
        outputStream.put(inputLines[index].data);
        outputStream.put("\n");
        if (!unbounded) ++numWritten;
    }
}
1336 
1337 /** A container and reader of data from a file or standard input.
1338  *
1339  * The FileData struct is used to read data from a file or standard input. It is used
1340  * by passing a filename to the constructor. The constructor reads the file data.
1341  * If the filename is a single hyphen ('-') then data is read from standard input.
1342  *
 * The struct makes the data available through two members: 'filename', which is the
1344  * filename, and 'data', which is a character array of the data.
1345  */
struct FileData
{
    /// The name passed to the constructor. A single hyphen ("-") denotes standard input.
    string filename;

    /// The complete contents read from the file or from standard input.
    char[] data;

    /** Reads all data from the named file, or from standard input if `fname` is "-".
     *
     * Params:
     *   fname = Path of the file to read, or "-" for standard input.
     */
    this(string fname)
    {
        import std.algorithm : min;
        import std.array : appender;

        filename = fname;

        ubyte[1024 * 128] fileRawBuf;
        auto dataAppender = appender(&data);
        auto ifile = (filename == "-") ? stdin : filename.File;

        /* For regular files, reserve space for the whole file up front. A reported
         * size of ulong.max is treated as "size not available" and no space is
         * reserved. The size is clamped to size_t.max and then cast explicitly,
         * because the ulong result of 'min' does not convert implicitly to size_t
         * on targets where size_t is 32 bits.
         */
        if (filename != "-")
        {
            immutable ulong filesize = ifile.size;
            if (filesize < ulong.max) dataAppender.reserve(cast(size_t) min(filesize, size_t.max));
        }

        /* Read the data chunk-by-chunk, appending each chunk to 'data'. */
        foreach (ref ubyte[] buffer; ifile.byChunk(fileRawBuf)) dataAppender.put(cast(char[]) buffer);
    }
}
1371 
/** HasRandomValue is a boolean flag used at compile time by identifyFileLines to
 * distinguish use cases needing random value assignments from those that don't.
 * It is also the template parameter of InputLine, where it controls whether the
 * struct has a 'randomValue' member.
 */
alias HasRandomValue = Flag!"hasRandomValue";
1376 
/** An InputLine array is returned by identifyFileLines to represent each non-header
 * line found in a FileData array. The 'data' member contains the line. A 'randomValue'
 * member is included if random values are being generated.
 */
struct InputLine(HasRandomValue hasRandomValue)
{
    const(char)[] data;
    static if (hasRandomValue) double randomValue;
}
1386 
/** Performs the first pass over fully loaded input for the algorithms that hold all
 * data in memory.
 *
 * Given the contents of every input file, this function:
 *   - Splits the data into individual lines. A single trailing newline at the end of
 *     a file does not produce an extra empty line.
 *   - When cmdopt.hasHeader is set, writes the first header line encountered to the
 *     output stream and drops the header lines of the remaining files. If
 *     cmdopt.printRandom is set, the header is prefixed with cmdopt.randomValueHeader
 *     and the delimiter.
 *   - Attaches a random value to each line when hasRandomValue is set. With
 *     isWeighted, the value is derived from the line's weight field; a line whose
 *     weight is not greater than zero gets a random value of zero.
 *
 * The key input is a FileData array, one element per file. Each FileData has read its
 * file when it was instantiated.
 *
 * The return value is an array of InputLine structs, one per non-header line, in input
 * order. The 'data' of each entry is a slice of the corresponding FileData buffer. The
 * structs have a 'randomValue' member only if random values are being assigned.
 */
InputLine!hasRandomValue[] identifyFileLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted, OutputRange)
(const ref FileData[] fileData, TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    /* Weighted processing always needs random values; printing them needs them too. */
    static assert(hasRandomValue || !isWeighted);
    static if(!hasRandomValue) assert(!cmdopt.printRandom);

    InputLine!hasRandomValue[] inputLines;
    auto linesAppender = appender(&inputLines);

    static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed);

    bool headerWritten = false;

    foreach (fd; fileData)
    {
        /* A trailing newline terminates the last line rather than starting a new one. */
        const endsWithNewline = fd.data.length > 0 && fd.data[$ - 1] == '\n';
        const data = endsWithNewline ? fd.data[0 .. $ - 1] : fd.data;

        foreach (fileLineNum, ref line; data.splitter('\n').enumerate(1))
        {
            const isFirstLine = (fileLineNum == 1);

            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, fd.filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Header line: emitted once, never stored as an input line. */
                if (headerWritten) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put(cmdopt.randomValueHeader);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");
                headerWritten = true;
                continue;
            }

            static if (hasRandomValue)
            {
                static if (isWeighted)
                {
                    immutable double lineWeight =
                        getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                             fd.filename, fileLineNum);

                    /* No random number is drawn for lines with a non-positive weight. */
                    immutable double randomValue =
                        (lineWeight > 0.0)
                        ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                        : 0.0;
                }
                else
                {
                    immutable double randomValue = uniform01(randomGenerator);
                }

                linesAppender.put(InputLine!hasRandomValue(line, randomValue));
            }
            else
            {
                linesAppender.put(InputLine!hasRandomValue(line));
            }
        }
    }

    return inputLines;
}
1471 
/** Write a floating point random value to an output stream.
 *
 * This routine is used for floating point random value printing. This routine writes
 * 17 significant digits, the range available in doubles. This routine prefers decimal
 * format, without exponents. It will generate somewhat large precision numbers,
 * currently up to 28 digits, before switching to exponents.
 *
 * The primary reason for this approach is to enable faster sorting on random values
 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster
 * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch).
 * The 'general numeric' handles exponential notation. The difference is 5-10x.
 *
 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12.
 * No examples less than 1e-09 were seen in hundreds of millions of trials. Similar
 * results were seen with weighted sampling with integer weights. The same is not true
 * with floating point weights. These produce quite large exponents. However, even
 * for floating point weights this can be useful. For random weights [0,1] less than 5%
 * will be less than 1e-12 and use exponential notation.
 *
 * Params:
 *   outputStream = Output range of char that receives the formatted text.
 *   value        = The value to format. Values of at least 1e-12 are written in
 *                  fixed-point notation ("%.17f" through "%.28f", depending on
 *                  magnitude). All other values, including negative values and NaN,
 *                  are written with "%.17g".
 */
void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;

    /* thresholds[i] is the smallest value printed with decimalFormats[i]. Each step
     * down in magnitude adds one fractional digit so that 17 significant digits are
     * retained.
     */
    static immutable double[12] thresholds =
        [1e-01, 1e-02, 1e-03, 1e-04, 1e-05, 1e-06,
         1e-07, 1e-08, 1e-09, 1e-10, 1e-11, 1e-12];

    static immutable string[12] decimalFormats =
        ["%.17f", "%.18f", "%.19f", "%.20f", "%.21f", "%.22f",
         "%.23f", "%.24f", "%.25f", "%.26f", "%.27f", "%.28f"];

    /* Fallback when no threshold is met: values below 1e-12, negatives, and NaN. */
    string formatString = "%.17g";

    foreach (i, threshold; thresholds)
    {
        if (value >= threshold)
        {
            formatString = decimalFormats[i];
            break;
        }
    }

    /* Only the selected format spec is constructed; this routine runs once per
     * output line, so building every candidate spec on each call is wasted work.
     */
    const formatSpec = singleSpec(formatString);
    outputStream.formatValue(value, formatSpec);
}
1527 
unittest
{
    /* formatRandomValue unit tests. Each case pairs an input value with the exact
     * text expected from formatRandomValue.
     */
    import std.array : appender;
    import std.format : format;

    static struct FormatCase
    {
        double value;
        string expected;
    }

    immutable FormatCase[] formatCases = [
        /* Powers of ten, crossing each precision threshold. */
        FormatCase(1.0,   "1.00000000000000000"),
        FormatCase(0.1,   "0.10000000000000001"),
        FormatCase(0.01,  "0.010000000000000000"),
        FormatCase(1e-03, "0.0010000000000000000"),
        FormatCase(1e-04, "0.00010000000000000000"),
        FormatCase(1e-05, "0.000010000000000000001"),
        FormatCase(1e-06, "0.0000010000000000000000"),
        FormatCase(1e-07, "0.00000010000000000000000"),
        FormatCase(1e-08, "0.000000010000000000000000"),
        FormatCase(1e-09, "0.0000000010000000000000001"),
        FormatCase(1e-10, "0.00000000010000000000000000"),
        FormatCase(1e-11, "0.000000000009999999999999999"),
        FormatCase(1e-12, "0.0000000000010000000000000000"),
        FormatCase(1e-13, "1e-13"),
        FormatCase(1e-14, "1e-14"),

        /* Values using all 17 significant digits. */
        FormatCase(12345678901234567e-15, "12.34567890123456735"),
        FormatCase(12345678901234567e-16, "1.23456789012345669"),
        FormatCase(12345678901234567e-17, "0.12345678901234566"),
        FormatCase(12345678901234567e-18, "0.012345678901234567"),
        FormatCase(12345678901234567e-19, "0.0012345678901234567"),
        FormatCase(12345678901234567e-20, "0.00012345678901234567"),
        FormatCase(12345678901234567e-21, "0.000012345678901234568"),
        FormatCase(12345678901234567e-22, "0.0000012345678901234567"),
        FormatCase(12345678901234567e-23, "0.00000012345678901234566"),
        FormatCase(12345678901234567e-24, "0.000000012345678901234567"),
        FormatCase(12345678901234567e-25, "0.0000000012345678901234566"),
        FormatCase(12345678901234567e-26, "0.00000000012345678901234568"),
        FormatCase(12345678901234567e-27, "0.000000000012345678901234567"),
        FormatCase(12345678901234567e-28, "0.0000000000012345678901234567"),
        FormatCase(12345678901234567e-29, "1.2345678901234566e-13"),
    ];

    foreach (testCase; formatCases)
    {
        auto actual = appender!string();
        actual.formatRandomValue(testCase.value);
        assert(actual.data == testCase.expected,
               format("[testFormatValue] value: %g; expected: %s; actual: %s",
                      testCase.value, testCase.expected, actual.data));
    }
}
1572 
1573 
import std.traits : isSomeChar;

/** Convenience function for extracting a single field from a line. See
 * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error
 * text tailored for this program.
 *
 * Params:
 *   line       = The line to extract the field from.
 *   fieldIndex = Index of the field, passed through to getTsvFieldValue.
 *   delim      = Field delimiter character.
 *   filename   = Name of the file the line came from. Used only in error text; a
 *                single hyphen ('-') is reported as "Standard Input".
 *   lineNum    = Line number within the file. Used only in error text.
 *
 * Returns: The field value, as produced by getTsvFieldValue!T.
 *
 * Throws: Exception if getTsvFieldValue throws. A ConvException is reported with a
 * header hint when lineNum is 1. Any other Exception is reported without the hint
 * (expected cause: not enough fields on the line).
 */
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException;
    import std.format : format;
    import tsv_utils.common.utils : getTsvFieldValue;

    try
    {
        return getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* The field text could not be converted to type T. */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s%s",
                   exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum,
                   (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : ""));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s",
                   exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum));
    }
}
1608 
unittest
{
    /* getFieldValue unit tests. getTsvFieldValue has its own tests.
     * These tests make basic sanity checks on the getFieldValue wrapper.
     */
    import std.exception : assertThrown;

    /* Successful conversions. */
    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);

    immutable size_t[] lineNums = [1, 2];

    /* Conversion failures, on a first line and on a later line. */
    foreach (lineNum; lineNums)
    {
        assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", lineNum));
    }

    /* Field index past the end of the line, on a first line and on a later line. */
    foreach (lineNum; lineNums)
    {
        assertThrown(getFieldValue!double("123", 1, '\t', "unittest", lineNum));
    }
}
1623 
1624 /* Unit tests for the main program start here.
1625  *
1626  * Portability note: Many of the tests here rely on generating consistent random numbers
1627  * across different platforms when using the same random seed. So far this has succeeded
1628  * on several different platform, compiler, and library versions. However, it is certainly
1629  * possible this condition will not hold on other platforms.
1630  *
1631  * For tsv-sample, this portability implies generating the same results on different
1632  * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
1633  * but it is convenient for testing. If platforms are identified that do not generate
1634  * the same results these tests will need to be adjusted.
1635  */
version(unittest)
{
    /* Helpers shared by the unit tests. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /* Runs tsvSample with the given command line arguments, capturing the output in
     * memory, and asserts that the output equals 'expected' rendered as TSV text.
     * cmdArgs[0] is used in failure messages and must be present.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Builds a failure message prefixed with the helper name and cmdArgs[0]. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            return format("[testTsvSample] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        /* Capture the arguments as text before handing cmdArgs to processArgs, so a
         * failure message shows the arguments as they were originally passed.
         */
        string argsAsText = cmdArgs.to!string;

        TsvSampleOptions options;
        auto argsResult = options.processArgs(cmdArgs);
        assert(argsResult[0], formatAssertMessage("Invalid command lines arg: '%s'.", argsAsText));

        auto actualOutput = appender!(char[])();
        tsvSample(options, actualOutput);    // This invokes the main code line.

        auto expectedOutput = expected.tsvDataToString;

        assert(actualOutput.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, actualOutput.data.to!string));
    }
}
1672 
1673 unittest
1674 {
1675     import std.path : buildPath;
1676     import std.file : rmdirRecurse;
1677     import std.format : format;
1678 
1679     auto testDir = makeUnittestTempDir("tsv_sample");
1680     scope(exit) testDir.rmdirRecurse;
1681 
1682     /* Tabular data sets and expected results use the built-in static seed.
1683      * Tests are run by writing the data set to a file, then calling the main
1684      * routine to process. The function testTsvSample plays the role of the
1685      * main program. Rather than writing to expected output, the results are
1686      * matched against expected. The expected results were verified by hand
1687      * prior to inclusion in the test.
1688      *
1689      * The initial part of this section is simply setting up data files and
1690      * expected results.
1691      *
1692      * Expected results naming conventions:
1693      *  - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected
1694      *  - Sampling Type (required): Permute, Replace, Bernoulli, Distinct
1695      *  - Compatibility: Compat, AlgoR, Skip, Swap
1696      *  - Weight Field: Wt<num>, e.g. Wt3
1697      *  - Sample Size: Num<num>, eg. Num3
1698      *  - Seed Value: V<num>, eg. V77
1699      *  - Key Field: K<num>, e.g. K2
1700      *  - Probability: P<num>, e.g P05 (5%)
1701      *  - Printing Probabilities: Probs
1702      *  - Printing Probs in order: ProbsInorder
1703      *  - Printing Probs with custom header: RVCustom
1704      */
1705 
1706     /* Empty file. */
1707     string[][] dataEmpty = [];
1708     string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
1709     writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
1710 
    /* 3x0, header only. */
1712     string[][] data3x0 = [["field_a", "field_b", "field_c"]];
1713     string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
1714     writeUnittestTsvFile(fpath_data3x0, data3x0);
1715 
1716     /* 3x1 */
1717     string[][] data3x1 =
1718         [["field_a", "field_b", "field_c"],
1719          ["tan", "タン", "8.5"]];
1720 
1721     string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
1722     string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
1723     writeUnittestTsvFile(fpath_data3x1, data3x1);
1724     writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]);
1725 
1726     string[][] data3x1ExpectedReplaceNum3 =
1727         [["field_a", "field_b", "field_c"],
1728          ["tan", "タン", "8.5"],
1729          ["tan", "タン", "8.5"],
1730          ["tan", "タン", "8.5"]];
1731 
1732     /* 3x2 */
1733     string[][] data3x2 =
1734         [["field_a", "field_b", "field_c"],
1735          ["brown", "褐色", "29.2"],
1736          ["gray", "グレー", "6.2"]];
1737 
1738     string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
1739     string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
1740     writeUnittestTsvFile(fpath_data3x2, data3x2);
1741     writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]);
1742 
1743     string[][] data3x2PermuteCompat =
1744         [["field_a", "field_b", "field_c"],
1745          ["gray", "グレー", "6.2"],
1746          ["brown", "褐色", "29.2"]];
1747 
1748     string[][] data3x2PermuteShuffle =
1749         [["field_a", "field_b", "field_c"],
1750          ["gray", "グレー", "6.2"],
1751          ["brown", "褐色", "29.2"]];
1752 
1753     /* 3x3 */
1754     string[][] data3x3 =
1755         [["field_a", "field_b", "field_c"],
1756          ["orange", "オレンジ", "2.5"],
1757          ["pink", "ピンク", "1.1"],
1758          ["purple", "紫の", "42"]];
1759 
1760     string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
1761     string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
1762     writeUnittestTsvFile(fpath_data3x3, data3x3);
1763     writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]);
1764 
1765     string[][] data3x3ExpectedPermuteCompat =
1766         [["field_a", "field_b", "field_c"],
1767          ["purple", "紫の", "42"],
1768          ["pink", "ピンク", "1.1"],
1769          ["orange", "オレンジ", "2.5"]];
1770 
1771     string[][] data3x3ExpectedPermuteSwap =
1772         [["field_a", "field_b", "field_c"],
1773          ["purple", "紫の", "42"],
1774          ["orange", "オレンジ", "2.5"],
1775          ["pink", "ピンク", "1.1"]];
1776 
1777     /* 3x6 */
1778     string[][] data3x6 =
1779         [["field_a", "field_b", "field_c"],
1780          ["red", "赤", "23.8"],
1781          ["green", "緑", "0.0072"],
1782          ["white", "白", "1.65"],
1783          ["yellow", "黄", "12"],
1784          ["blue", "青", "12"],
1785          ["black", "黒", "0.983"]];
1786     string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
1787     string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
1788     writeUnittestTsvFile(fpath_data3x6, data3x6);
1789     writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]);
1790 
1791     // Randomization, all lines
1792     string[][] data3x6ExpectedPermuteCompat =
1793         [["field_a", "field_b", "field_c"],
1794          ["yellow", "黄", "12"],
1795          ["black", "黒", "0.983"],
1796          ["blue", "青", "12"],
1797          ["white", "白", "1.65"],
1798          ["green", "緑", "0.0072"],
1799          ["red", "赤", "23.8"]];
1800 
1801     string[][] data3x6ExpectedPermuteSwap =
1802         [["field_a", "field_b", "field_c"],
1803          ["black", "黒", "0.983"],
1804          ["green", "緑", "0.0072"],
1805          ["red", "赤", "23.8"],
1806          ["yellow", "黄", "12"],
1807          ["white", "白", "1.65"],
1808          ["blue", "青", "12"]];
1809 
1810     string[][] data3x6ExpectedPermuteCompatProbs =
1811         [["random_value", "field_a", "field_b", "field_c"],
1812          ["0.96055546286515892", "yellow", "黄", "12"],
1813          ["0.75710153928957880", "black", "黒", "0.983"],
1814          ["0.52525980887003243", "blue", "青", "12"],
1815          ["0.49287854949943721", "white", "白", "1.65"],
1816          ["0.15929344086907804", "green", "緑", "0.0072"],
1817          ["0.010968807619065046", "red", "赤", "23.8"]];
1818 
    /* Note: data3x6ExpectedPermuteAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because
1820      * both are effectively the same algorithm given that --num is data length. Both read
1821      * in the full data in order then call randomShuffle.
1822      */
1823     string[][] data3x6ExpectedPermuteAlgoRNum6 =
1824         [["field_a", "field_b", "field_c"],
1825          ["black", "黒", "0.983"],
1826          ["green", "緑", "0.0072"],
1827          ["red", "赤", "23.8"],
1828          ["yellow", "黄", "12"],
1829          ["white", "白", "1.65"],
1830          ["blue", "青", "12"]];
1831 
1832     string[][] data3x6ExpectedPermuteAlgoRNum5 =
1833         [["field_a", "field_b", "field_c"],
1834          ["red", "赤", "23.8"],
1835          ["black", "黒", "0.983"],
1836          ["white", "白", "1.65"],
1837          ["green", "緑", "0.0072"],
1838          ["yellow", "黄", "12"]];
1839 
1840     string[][] data3x6ExpectedPermuteAlgoRNum4 =
1841         [["field_a", "field_b", "field_c"],
1842          ["blue", "青", "12"],
1843          ["green", "緑", "0.0072"],
1844          ["black", "黒", "0.983"],
1845          ["white", "白", "1.65"]];
1846 
1847     string[][] data3x6ExpectedPermuteAlgoRNum3 =
1848         [["field_a", "field_b", "field_c"],
1849          ["red", "赤", "23.8"],
1850          ["black", "黒", "0.983"],
1851          ["green", "緑", "0.0072"]];
1852 
1853     string[][] data3x6ExpectedPermuteAlgoRNum2 =
1854         [["field_a", "field_b", "field_c"],
1855          ["black", "黒", "0.983"],
1856          ["red", "赤", "23.8"]];
1857 
1858     string[][] data3x6ExpectedPermuteAlgoRNum1 =
1859         [["field_a", "field_b", "field_c"],
1860          ["green", "緑", "0.0072"]];
1861 
1862     string[][] data3x6ExpectedBernoulliProbsP100 =
1863         [["random_value", "field_a", "field_b", "field_c"],
1864          ["0.010968807619065046", "red", "赤", "23.8"],
1865          ["0.15929344086907804", "green", "緑", "0.0072"],
1866          ["0.49287854949943721", "white", "白", "1.65"],
1867          ["0.96055546286515892", "yellow", "黄", "12"],
1868          ["0.52525980887003243", "blue", "青", "12"],
1869          ["0.75710153928957880", "black", "黒", "0.983"]];
1870 
1871     string[][] data3x6ExpectedBernoulliCompatProbsP60 =
1872         [["random_value", "field_a", "field_b", "field_c"],
1873          ["0.010968807619065046", "red", "赤", "23.8"],
1874          ["0.15929344086907804", "green", "緑", "0.0072"],
1875          ["0.49287854949943721", "white", "白", "1.65"],
1876          ["0.52525980887003243", "blue", "青", "12"]];
1877 
1878     string[][] data3x6ExpectedBernoulliSkipP40 =
1879         [["field_a", "field_b", "field_c"],
1880          ["red", "赤", "23.8"],
1881          ["green", "緑", "0.0072"],
1882          ["yellow", "黄", "12"]];
1883 
1884     string[][] data3x6ExpectedBernoulliCompatP60 =
1885         [["field_a", "field_b", "field_c"],
1886          ["red", "赤", "23.8"],
1887          ["green", "緑", "0.0072"],
1888          ["white", "白", "1.65"],
1889          ["blue", "青", "12"]];
1890 
1891     string[][] data3x6ExpectedDistinctK1K3P60 =
1892         [["field_a", "field_b", "field_c"],
1893          ["green", "緑", "0.0072"],
1894          ["white", "白", "1.65"],
1895          ["blue", "青", "12"]];
1896 
1897     string[][] data3x6ExpectedDistinctK1K3P60Probs =
1898         [["random_value", "field_a", "field_b", "field_c"],
1899          ["0", "green", "緑", "0.0072"],
1900          ["0", "white", "白", "1.65"],
1901          ["0", "blue", "青", "12"]];
1902 
1903     string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom =
1904         [["custom_random_value_header", "field_a", "field_b", "field_c"],
1905          ["0", "green", "緑", "0.0072"],
1906          ["0", "white", "白", "1.65"],
1907          ["0", "blue", "青", "12"]];
1908 
1909     string[][] data3x6ExpectedDistinctK2P2ProbsInorder =
1910         [["random_value", "field_a", "field_b", "field_c"],
1911          ["1", "red", "赤", "23.8"],
1912          ["0", "green", "緑", "0.0072"],
1913          ["0", "white", "白", "1.65"],
1914          ["1", "yellow", "黄", "12"],
1915          ["3", "blue", "青", "12"],
1916          ["2", "black", "黒", "0.983"]];
1917 
1918     string[][] data3x6ExpectedPermuteWt3Probs =
1919         [["random_value", "field_a", "field_b", "field_c"],
1920          ["0.99665198757645390", "yellow", "黄", "12"],
1921          ["0.94775884809836686", "blue", "青", "12"],
1922          ["0.82728234682286661", "red", "赤", "23.8"],
1923          ["0.75346697377181959", "black", "黒", "0.983"],
1924          ["0.65130103496422487", "white", "白", "1.65"],
1925          ["1.5636943712879866e-111", "green", "緑", "0.0072"]];
1926 
1927     string[][] data3x6ExpectedWt3ProbsInorder =
1928         [["random_value", "field_a", "field_b", "field_c"],
1929          ["0.82728234682286661", "red", "赤", "23.8"],
1930          ["1.5636943712879866e-111", "green", "緑", "0.0072"],
1931          ["0.65130103496422487", "white", "白", "1.65"],
1932          ["0.99665198757645390", "yellow", "黄", "12"],
1933          ["0.94775884809836686", "blue", "青", "12"],
1934          ["0.75346697377181959", "black", "黒", "0.983"]];
1935 
1936     string[][] data3x6ExpectedPermuteWt3 =
1937         [["field_a", "field_b", "field_c"],
1938          ["yellow", "黄", "12"],
1939          ["blue", "青", "12"],
1940          ["red", "赤", "23.8"],
1941          ["black", "黒", "0.983"],
1942          ["white", "白", "1.65"],
1943          ["green", "緑", "0.0072"]];
1944 
1945     string[][] data3x6ExpectedReplaceNum10 =
1946         [["field_a", "field_b", "field_c"],
1947          ["black", "黒", "0.983"],
1948          ["green", "緑", "0.0072"],
1949          ["green", "緑", "0.0072"],
1950          ["red", "赤", "23.8"],
1951          ["yellow", "黄", "12"],
1952          ["red", "赤", "23.8"],
1953          ["white", "白", "1.65"],
1954          ["yellow", "黄", "12"],
1955          ["yellow", "黄", "12"],
1956          ["white", "白", "1.65"],
1957         ];
1958 
1959     string[][] data3x6ExpectedReplaceNum10V77 =
1960         [["field_a", "field_b", "field_c"],
1961          ["black", "黒", "0.983"],
1962          ["red", "赤", "23.8"],
1963          ["black", "黒", "0.983"],
1964          ["yellow", "黄", "12"],
1965          ["green", "緑", "0.0072"],
1966          ["green", "緑", "0.0072"],
1967          ["green", "緑", "0.0072"],
1968          ["yellow", "黄", "12"],
1969          ["blue", "青", "12"],
1970          ["white", "白", "1.65"],
1971         ];
1972 
1973     /* Using a different static seed. */
1974     string[][] data3x6ExpectedPermuteCompatV41Probs =
1975         [["random_value", "field_a", "field_b", "field_c"],
1976          ["0.68057272653095424", "green", "緑", "0.0072"],
1977          ["0.67681624367833138", "blue", "青", "12"],
1978          ["0.32097338931635022", "yellow", "黄", "12"],
1979          ["0.25092361867427826", "red", "赤", "23.8"],
1980          ["0.15535934292711318", "black", "黒", "0.983"],
1981          ["0.046095821075141430", "white", "白", "1.65"]];
1982 
1983     string[][] data3x6ExpectedBernoulliCompatP60V41Probs =
1984         [["random_value", "field_a", "field_b", "field_c"],
1985          ["0.25092361867427826", "red", "赤", "23.8"],
1986          ["0.046095821075141430", "white", "白", "1.65"],
1987          ["0.32097338931635022", "yellow", "黄", "12"],
1988          ["0.15535934292711318", "black", "黒", "0.983"]];
1989 
1990     string[][] data3x6ExpectedPermuteWt3V41Probs =
1991         [["random_value", "field_a", "field_b", "field_c"],
1992          ["0.96799377498910666", "blue", "青", "12"],
1993          ["0.94356245792573568", "red", "赤", "23.8"],
1994          ["0.90964601024271996", "yellow", "黄", "12"],
1995          ["0.15491658409260103", "white", "白", "1.65"],
1996          ["0.15043620392537033", "black", "黒", "0.983"],
1997          ["6.1394674830701461e-24", "green", "緑", "0.0072"]];
1998 
1999     string[][] data3x6ExpectedWt3V41ProbsInorder =
2000         [["random_value", "field_a", "field_b", "field_c"],
2001          ["0.94356245792573568", "red", "赤", "23.8"],
2002          ["6.1394674830701461e-24", "green", "緑", "0.0072"],
2003          ["0.15491658409260103", "white", "白", "1.65"],
2004          ["0.90964601024271996", "yellow", "黄", "12"],
2005          ["0.96799377498910666", "blue", "青", "12"],
2006          ["0.15043620392537033", "black", "黒", "0.983"]];
2007 
2008 
2009     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2010     string[][] combo1ExpectedPermuteCompat =
2011         [["field_a", "field_b", "field_c"],
2012          ["yellow", "黄", "12"],
2013          ["tan", "タン", "8.5"],
2014          ["brown", "褐色", "29.2"],
2015          ["green", "緑", "0.0072"],
2016          ["red", "赤", "23.8"],
2017          ["purple", "紫の", "42"],
2018          ["black", "黒", "0.983"],
2019          ["white", "白", "1.65"],
2020          ["gray", "グレー", "6.2"],
2021          ["blue", "青", "12"],
2022          ["pink", "ピンク", "1.1"],
2023          ["orange", "オレンジ", "2.5"]];
2024 
2025     string[][] combo1ExpectedPermuteCompatProbs =
2026         [["random_value", "field_a", "field_b", "field_c"],
2027          ["0.97088520275428891", "yellow", "黄", "12"],
2028          ["0.96055546286515892", "tan", "タン", "8.5"],
2029          ["0.81756894313730299", "brown", "褐色", "29.2"],
2030          ["0.75710153928957880", "green", "緑", "0.0072"],
2031          ["0.52525980887003243", "red", "赤", "23.8"],
2032          ["0.49287854949943721", "purple", "紫の", "42"],
2033          ["0.47081507067196071", "black", "黒", "0.983"],
2034          ["0.38388182921335101", "white", "白", "1.65"],
2035          ["0.29215990612283349", "gray", "グレー", "6.2"],
2036          ["0.24033216014504433", "blue", "青", "12"],
2037          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2038          ["0.010968807619065046", "orange", "オレンジ", "2.5"]];
2039 
2040     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2041     string[][] combo1ExpectedProbsInorder =
2042         [["random_value", "field_a", "field_b", "field_c"],
2043          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2044          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2045          ["0.49287854949943721", "purple", "紫の", "42"],
2046          ["0.96055546286515892", "tan", "タン", "8.5"],
2047          ["0.52525980887003243", "red", "赤", "23.8"],
2048          ["0.75710153928957880", "green", "緑", "0.0072"],
2049          ["0.38388182921335101", "white", "白", "1.65"],
2050          ["0.97088520275428891", "yellow", "黄", "12"],
2051          ["0.24033216014504433", "blue", "青", "12"],
2052          ["0.47081507067196071", "black", "黒", "0.983"],
2053          ["0.81756894313730299", "brown", "褐色", "29.2"],
2054          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2055 
2056     string[][] combo1ExpectedBernoulliCompatP50Probs =
2057         [["random_value", "field_a", "field_b", "field_c"],
2058          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2059          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2060          ["0.49287854949943721", "purple", "紫の", "42"],
2061          ["0.38388182921335101", "white", "白", "1.65"],
2062          ["0.24033216014504433", "blue", "青", "12"],
2063          ["0.47081507067196071", "black", "黒", "0.983"],
2064          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2065 
2066     string[][] combo1ExpectedBernoulliCompatP40 =
2067         [["field_a", "field_b", "field_c"],
2068          ["orange", "オレンジ", "2.5"],
2069          ["pink", "ピンク", "1.1"],
2070          ["white", "白", "1.65"],
2071          ["blue", "青", "12"],
2072          ["gray", "グレー", "6.2"]];
2073 
2074     string[][] combo1ExpectedDistinctK1P40 =
2075         [["field_a", "field_b", "field_c"],
2076          ["orange", "オレンジ", "2.5"],
2077          ["red", "赤", "23.8"],
2078          ["green", "緑", "0.0072"],
2079          ["blue", "青", "12"],
2080          ["black", "黒", "0.983"]];
2081 
2082     string[][] combo1ExpectedPermuteWt3Probs =
2083         [["random_value", "field_a", "field_b", "field_c"],
2084          ["0.99754077523718754", "yellow", "黄", "12"],
2085          ["0.99527665440088786", "tan", "タン", "8.5"],
2086          ["0.99312578945741659", "brown", "褐色", "29.2"],
2087          ["0.98329602553389361", "purple", "紫の", "42"],
2088          ["0.97330961938083660", "red", "赤", "23.8"],
2089          ["0.88797551521739648", "blue", "青", "12"],
2090          ["0.81999230489041786", "gray", "グレー", "6.2"],
2091          ["0.55975569204250941", "white", "白", "1.65"],
2092          ["0.46472135609205739", "black", "黒", "0.983"],
2093          ["0.18824582704191337", "pink", "ピンク", "1.1"],
2094          ["0.16446131853299920", "orange", "オレンジ", "2.5"],
2095          ["1.6438086931020549e-17", "green", "緑", "0.0072"]];
2096 
2097     string[][] combo1ExpectedPermuteWt3 =
2098         [["field_a", "field_b", "field_c"],
2099          ["yellow", "黄", "12"],
2100          ["tan", "タン", "8.5"],
2101          ["brown", "褐色", "29.2"],
2102          ["purple", "紫の", "42"],
2103          ["red", "赤", "23.8"],
2104          ["blue", "青", "12"],
2105          ["gray", "グレー", "6.2"],
2106          ["white", "白", "1.65"],
2107          ["black", "黒", "0.983"],
2108          ["pink", "ピンク", "1.1"],
2109          ["orange", "オレンジ", "2.5"],
2110          ["green", "緑", "0.0072"]];
2111 
2112         string[][] combo1ExpectedPermuteAlgoRNum4 =
2113         [["field_a", "field_b", "field_c"],
2114          ["blue", "青", "12"],
2115          ["gray", "グレー", "6.2"],
2116          ["brown", "褐色", "29.2"],
2117          ["white", "白", "1.65"]];
2118 
2119     string[][] combo1ExpectedReplaceNum10 =
2120         [["field_a", "field_b", "field_c"],
2121          ["gray", "グレー", "6.2"],
2122          ["yellow", "黄", "12"],
2123          ["yellow", "黄", "12"],
2124          ["white", "白", "1.65"],
2125          ["tan", "タン", "8.5"],
2126          ["white", "白", "1.65"],
2127          ["blue", "青", "12"],
2128          ["black", "黒", "0.983"],
2129          ["tan", "タン", "8.5"],
2130          ["purple", "紫の", "42"]];
2131 
2132     /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. */
2133     string[][] data1x200 =
2134         [["field_a"],
2135          ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"],
2136          ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"],
2137          ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"],
2138          ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"],
2139          ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"],
2140          ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"],
2141          ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"],
2142          ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"],
2143          ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"],
2144          ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"],
2145          ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"],
2146          ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"],
2147          ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"],
2148          ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"],
2149          ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"],
2150          ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"],
2151          ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"],
2152          ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"],
2153          ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"],
2154          ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"],
2155         ];
2156 
2157     string fpath_data1x200 = buildPath(testDir, "data1x200.tsv");
2158     string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv");
2159     writeUnittestTsvFile(fpath_data1x200, data1x200);
2160     writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1..$]);
2161 
2162     string[][] data1x200ExpectedBernoulliSkipV333P01 =
2163         [["field_a"],
2164          ["077"],
2165          ["119"]];
2166 
2167     string[][] data1x200ExpectedBernoulliSkipV333P02 =
2168         [["field_a"],
2169          ["038"],
2170          ["059"],
2171          ["124"],
2172          ["161"],
2173          ["162"],
2174          ["183"]];
2175 
2176     string[][] data1x200ExpectedBernoulliSkipV333P03 =
2177         [["field_a"],
2178          ["025"],
2179          ["039"],
2180          ["082"],
2181          ["107"],
2182          ["108"],
2183          ["122"],
2184          ["136"],
2185          ["166"],
2186          ["182"]];
2187 
2188     string[][] data1x200ExpectedBernoulliCompatV333P01 =
2189         [["field_a"],
2190          ["072"]];
2191 
2192     string[][] data1x200ExpectedBernoulliCompatV333P02 =
2193         [["field_a"],
2194          ["004"],
2195          ["072"]];
2196 
2197     string[][] data1x200ExpectedBernoulliCompatV333P03 =
2198         [["field_a"],
2199          ["004"],
2200          ["072"],
2201          ["181"]];
2202 
    /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files,
     * only expected results. The header is from 3x0. The results are offset by one position
     * from data1x200ExpectedBernoulliSkipV333P03 because a single line precedes the 1x200
     * data in the combined input.
     */
2207     string[][] combo2ExpectedBernoulliSkipV333P03 =
2208         [["field_a", "field_b", "field_c"],
2209          ["024"],
2210          ["038"],
2211          ["081"],
2212          ["106"],
2213          ["107"],
2214          ["121"],
2215          ["135"],
2216          ["165"],
2217          ["181"]];
2218 
2219 
2220     /* 1x10 - Simple 1-column file. */
2221     string[][] data1x10 =
2222         [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
2223     string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
2224     string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
2225     writeUnittestTsvFile(fpath_data1x10, data1x10);
2226     writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]);
2227 
2228     string[][] data1x10ExpectedPermuteCompat =
2229         [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
2230 
2231     string[][] data1x10ExpectedPermuteWt1 =
2232         [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]];
2233 
2234     /* 2x10a - Uniform distribution [0,1]. */
2235     string[][] data2x10a =
2236         [["line", "weight"],
2237          ["1", "0.26788837"],
2238          ["2", "0.06601298"],
2239          ["3", "0.38627527"],
2240          ["4", "0.47379424"],
2241          ["5", "0.02966641"],
2242          ["6", "0.05636231"],
2243          ["7", "0.70529242"],
2244          ["8", "0.91836862"],
2245          ["9", "0.99103720"],
2246          ["10", "0.31401740"]];
2247 
2248     string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv");
2249     writeUnittestTsvFile(fpath_data2x10a, data2x10a);
2250 
2251     string[][] data2x10aExpectedPermuteWt2Probs =
2252         [["random_value", "line", "weight"],
2253          ["0.96833865494543658", "8", "0.91836862"],
2254          ["0.91856842054413923", "4", "0.47379424"],
2255          ["0.25730832087795091", "7", "0.70529242"],
2256          ["0.23725317907018120", "9", "0.99103720"],
2257          ["0.16016096701872204", "3", "0.38627527"],
2258          ["0.090819662667243381", "10", "0.31401740"],
2259          ["0.0071764539244361172", "6", "0.05636231"],
2260          ["0.000000048318642951630057", "1", "0.26788837"],
2261          ["0.00000000037525692966535517", "5", "0.02966641"],
2262          ["8.2123247880095796e-13", "2", "0.06601298"]];
2263 
2264     /* 2x10b - Uniform distribution [0,1000]. */
2265     string[][] data2x10b =
2266         [["line", "weight"],
2267          ["1", "761"],
2268          ["2", "432"],
2269          ["3", "103"],
2270          ["4", "448"],
2271          ["5", "750"],
2272          ["6", "711"],
2273          ["7", "867"],
2274          ["8", "841"],
2275          ["9", "963"],
2276          ["10", "784"]];
2277 
2278     string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv");
2279     writeUnittestTsvFile(fpath_data2x10b, data2x10b);
2280 
2281     string[][] data2x10bExpectedPermuteWt2Probs =
2282         [["random_value", "line", "weight"],
2283          ["0.99996486739067969", "8", "841"],
2284          ["0.99991017467137211", "4", "448"],
2285          ["0.99960871524873662", "6", "711"],
2286          ["0.99914188537143800", "5", "750"],
2287          ["0.99903963250274785", "10", "784"],
2288          ["0.99889631825931946", "7", "867"],
2289          ["0.99852058315191139", "9", "963"],
2290          ["0.99575669679158918", "2", "432"],
2291          ["0.99408758732050595", "1", "761"],
2292          ["0.99315467761212362", "3", "103"]];
2293 
2294     /* 2x10c - Logarithmic distribution in random order. */
2295     string[][] data2x10c =
2296         [["line", "weight"],
2297          ["1", "31.85"],
2298          ["2", "17403.31"],
2299          ["3", "653.84"],
2300          ["4", "8.23"],
2301          ["5", "2671.04"],
2302          ["6", "26226.08"],
2303          ["7", "1.79"],
2304          ["8", "354.56"],
2305          ["9", "35213.81"],
2306          ["10", "679.29"]];
2307 
2308     string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv");
2309     writeUnittestTsvFile(fpath_data2x10c, data2x10c);
2310 
2311     string[][] data2x10cExpectedPermuteWt2Probs =
2312         [["random_value", "line", "weight"],
2313          ["0.99998939008709697", "6", "26226.08"],
2314          ["0.99995951291695517", "9", "35213.81"],
2315          ["0.99991666907613541", "8", "354.56"],
2316          ["0.99989445052186410", "2", "17403.31"],
2317          ["0.99975897602861630", "5", "2671.04"],
2318          ["0.99891852769877643", "3", "653.84"],
2319          ["0.99889167752782515", "10", "679.29"],
2320          ["0.99512207506850148", "4", "8.23"],
2321          ["0.86789371584259023", "1", "31.85"],
2322          ["0.58574438162915610", "7", "1.79"]];
2323 
2324     /* 2x10d. Logarithmic distribution in ascending order. */
2325     string[][] data2x10d =
2326         [["line", "weight"],
2327          ["1", "1.79"],
2328          ["2", "8.23"],
2329          ["3", "31.85"],
2330          ["4", "354.56"],
2331          ["5", "653.84"],
2332          ["6", "679.29"],
2333          ["7", "2671.04"],
2334          ["8", "17403.31"],
2335          ["9", "26226.08"],
2336          ["10", "35213.81"]];
2337 
2338     string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv");
2339     writeUnittestTsvFile(fpath_data2x10d, data2x10d);
2340 
2341     string[][] data2x10dExpectedPermuteWt2Probs =
2342         [["random_value", "line", "weight"],
2343          ["0.99999830221846353", "8", "17403.31"],
2344          ["0.99997860834041397", "10", "35213.81"],
2345          ["0.99994563828986716", "9", "26226.08"],
2346          ["0.99988650363575737", "4", "354.56"],
2347          ["0.99964161939190088", "7", "2671.04"],
2348          ["0.99959045338948649", "6", "679.29"],
2349          ["0.99901574490639788", "5", "653.84"],
2350          ["0.97803163304747431", "3", "31.85"],
2351          ["0.79994791806910948", "2", "8.23"],
2352          ["0.080374261239949119", "1", "1.79"]];
2353 
2354     /* 2x10e. Logarithmic distribution in descending order. */
2355     string[][] data2x10e =
2356         [["line", "weight"],
2357          ["1", "35213.81"],
2358          ["2", "26226.08"],
2359          ["3", "17403.31"],
2360          ["4", "2671.04"],
2361          ["5", "679.29"],
2362          ["6", "653.84"],
2363          ["7", "354.56"],
2364          ["8", "31.85"],
2365          ["9", "8.23"],
2366          ["10", "1.79"]];
2367     string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
2368     writeUnittestTsvFile(fpath_data2x10e, data2x10e);
2369 
2370     string[][] data2x10eExpectedPermuteWt2Probs =
2371         [["random_value", "line", "weight"],
2372          ["0.99998493348975237", "4", "2671.04"],
2373          ["0.99995934807202624", "3", "17403.31"],
2374          ["0.99992995739727453", "2", "26226.08"],
2375          ["0.99987185679245649", "1", "35213.81"],
2376          ["0.99957451563173938", "6", "653.84"],
2377          ["0.99907273650209583", "8", "31.85"],
2378          ["0.99905260312968946", "5", "679.29"],
2379          ["0.99730333650516401", "7", "354.56"],
2380          ["0.84093902435227808", "9", "8.23"],
2381          ["0.65650015926290028", "10", "1.79"]];
2382 
2383     /* Data sets for distinct sampling. */
2384     string[][] data5x25 =
2385         [["ID", "Shape", "Color", "Size", "Weight"],
2386          ["01", "circle", "red", "S", "10"],
2387          ["02", "circle", "black", "L", "20"],
2388          ["03", "square", "black", "L", "20"],
2389          ["04", "circle", "green", "L", "30"],
2390          ["05", "ellipse", "red", "S", "20"],
2391          ["06", "triangle", "red", "S", "10"],
2392          ["07", "triangle", "red", "L", "20"],
2393          ["08", "square", "black", "S", "10"],
2394          ["09", "circle", "black", "S", "20"],
2395          ["10", "square", "green", "L", "20"],
2396          ["11", "triangle", "red", "L", "20"],
2397          ["12", "circle", "green", "L", "30"],
2398          ["13", "ellipse", "red", "S", "20"],
2399          ["14", "circle", "green", "L", "30"],
2400          ["15", "ellipse", "red", "L", "30"],
2401          ["16", "square", "red", "S", "10"],
2402          ["17", "circle", "black", "L", "20"],
2403          ["18", "square", "red", "S", "20"],
2404          ["19", "square", "black", "L", "20"],
2405          ["20", "circle", "red", "S", "10"],
2406          ["21", "ellipse", "black", "L", "30"],
2407          ["22", "triangle", "red", "L", "30"],
2408          ["23", "circle", "green", "S", "20"],
2409          ["24", "square", "green", "L", "20"],
2410          ["25", "circle", "red", "S", "10"],
2411         ];
2412 
2413     string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
2414     string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
2415     writeUnittestTsvFile(fpath_data5x25, data5x25);
2416     writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]);
2417 
2418     string[][] data5x25ExpectedDistinctK2P40 =
2419         [["ID", "Shape", "Color", "Size", "Weight"],
2420          ["03", "square", "black", "L", "20"],
2421          ["05", "ellipse", "red", "S", "20"],
2422          ["08", "square", "black", "S", "10"],
2423          ["10", "square", "green", "L", "20"],
2424          ["13", "ellipse", "red", "S", "20"],
2425          ["15", "ellipse", "red", "L", "30"],
2426          ["16", "square", "red", "S", "10"],
2427          ["18", "square", "red", "S", "20"],
2428          ["19", "square", "black", "L", "20"],
2429          ["21", "ellipse", "black", "L", "30"],
2430          ["24", "square", "green", "L", "20"],
2431         ];
2432 
2433     string[][] data5x25ExpectedDistinctK2K4P20 =
2434         [["ID", "Shape", "Color", "Size", "Weight"],
2435          ["03", "square", "black", "L", "20"],
2436          ["07", "triangle", "red", "L", "20"],
2437          ["08", "square", "black", "S", "10"],
2438          ["10", "square", "green", "L", "20"],
2439          ["11", "triangle", "red", "L", "20"],
2440          ["16", "square", "red", "S", "10"],
2441          ["18", "square", "red", "S", "20"],
2442          ["19", "square", "black", "L", "20"],
2443          ["22", "triangle", "red", "L", "30"],
2444          ["24", "square", "green", "L", "20"],
2445         ];
2446 
2447     string[][] data5x25ExpectedDistinctK2K3K4P20 =
2448         [["ID", "Shape", "Color", "Size", "Weight"],
2449          ["04", "circle", "green", "L", "30"],
2450          ["07", "triangle", "red", "L", "20"],
2451          ["09", "circle", "black", "S", "20"],
2452          ["11", "triangle", "red", "L", "20"],
2453          ["12", "circle", "green", "L", "30"],
2454          ["14", "circle", "green", "L", "30"],
2455          ["16", "square", "red", "S", "10"],
2456          ["18", "square", "red", "S", "20"],
2457          ["22", "triangle", "red", "L", "30"],
2458         ];
2459 
    /* Fields 2 and 4 from data5x25. Distinct rows should be the same for equivalent keys. */
2461     string[][] data2x25 =
2462         [["Shape", "Size"],
2463          ["circle", "S"],
2464          ["circle", "L"],
2465          ["square", "L"],
2466          ["circle", "L"],
2467          ["ellipse", "S"],
2468          ["triangle", "S"],
2469          ["triangle", "L"],
2470          ["square", "S"],
2471          ["circle", "S"],
2472          ["square", "L"],
2473          ["triangle", "L"],
2474          ["circle", "L"],
2475          ["ellipse", "S"],
2476          ["circle", "L"],
2477          ["ellipse", "L"],
2478          ["square", "S"],
2479          ["circle", "L"],
2480          ["square", "S"],
2481          ["square", "L"],
2482          ["circle", "S"],
2483          ["ellipse", "L"],
2484          ["triangle", "L"],
2485          ["circle", "S"],
2486          ["square", "L"],
2487          ["circle", "S"],
2488         ];
2489 
2490     string fpath_data2x25 = buildPath(testDir, "data2x25.tsv");
2491     string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv");
2492     writeUnittestTsvFile(fpath_data2x25, data2x25);
2493     writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1..$]);
2494 
2495     string[][] data2x25ExpectedDistinctK1K2P20 =
2496         [["Shape", "Size"],
2497          ["square", "L"],
2498          ["triangle", "L"],
2499          ["square", "S"],
2500          ["square", "L"],
2501          ["triangle", "L"],
2502          ["square", "S"],
2503          ["square", "S"],
2504          ["square", "L"],
2505          ["triangle", "L"],
2506          ["square", "L"],
2507         ];
2508 
2509     string[][] data1x25 =
2510         [["Shape-Size"],
2511          ["circle-S"],
2512          ["circle-L"],
2513          ["square-L"],
2514          ["circle-L"],
2515          ["ellipse-S"],
2516          ["triangle-S"],
2517          ["triangle-L"],
2518          ["square-S"],
2519          ["circle-S"],
2520          ["square-L"],
2521          ["triangle-L"],
2522          ["circle-L"],
2523          ["ellipse-S"],
2524          ["circle-L"],
2525          ["ellipse-L"],
2526          ["square-S"],
2527          ["circle-L"],
2528          ["square-S"],
2529          ["square-L"],
2530          ["circle-S"],
2531          ["ellipse-L"],
2532          ["triangle-L"],
2533          ["circle-S"],
2534          ["square-L"],
2535          ["circle-S"],
2536         ];
2537 
2538     string fpath_data1x25 = buildPath(testDir, "data1x25.tsv");
2539     string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv");
2540     writeUnittestTsvFile(fpath_data1x25, data1x25);
2541     writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1..$]);
2542 
2543     string[][] data1x25ExpectedDistinctK1P20 =
2544         [["Shape-Size"],
2545          ["triangle-L"],
2546          ["square-S"],
2547          ["triangle-L"],
2548          ["ellipse-L"],
2549          ["square-S"],
2550          ["square-S"],
2551          ["ellipse-L"],
2552          ["triangle-L"],
2553         ];
2554 
2555     string[][] data1x25ExpectedDistinctK1P20Probs =
2556         [["random_value", "Shape-Size"],
2557          ["0", "triangle-L"],
2558          ["0", "square-S"],
2559          ["0", "triangle-L"],
2560          ["0", "ellipse-L"],
2561          ["0", "square-S"],
2562          ["0", "square-S"],
2563          ["0", "ellipse-L"],
2564          ["0", "triangle-L"],
2565         ];
2566 
2567     string[][] data1x25ExpectedDistinctK1P20ProbsInorder =
2568         [["random_value", "Shape-Size"],
2569          ["1", "circle-S"],
2570          ["4", "circle-L"],
2571          ["2", "square-L"],
2572          ["4", "circle-L"],
2573          ["2", "ellipse-S"],
2574          ["1", "triangle-S"],
2575          ["0", "triangle-L"],
2576          ["0", "square-S"],
2577          ["1", "circle-S"],
2578          ["2", "square-L"],
2579          ["0", "triangle-L"],
2580          ["4", "circle-L"],
2581          ["2", "ellipse-S"],
2582          ["4", "circle-L"],
2583          ["0", "ellipse-L"],
2584          ["0", "square-S"],
2585          ["4", "circle-L"],
2586          ["0", "square-S"],
2587          ["2", "square-L"],
2588          ["1", "circle-S"],
2589          ["0", "ellipse-L"],
2590          ["0", "triangle-L"],
2591          ["1", "circle-S"],
2592          ["2", "square-L"],
2593          ["1", "circle-S"],
2594         ];
2595 
2596     /*
2597      * Enough setup! Actually run some tests!
2598      */
2599 
2600     /* Permutations. Headers, static seed, compatibility mode. With weights and without. */
2601     testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty);
2602     testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0);
2603     testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1);
2604     testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat);
2605     testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat);
2606     testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat);
2607     testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
2608     testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
2609     testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
2610     testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2611     testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2612     testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
2613     testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
2614 
2615     /* Permutations, without compatibility mode, or with both compatibility and printing. */
2616     testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
2617     testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0);
2618     testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1);
2619     testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle);
2620     testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap);
2621     testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap);
2622     testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
2623     testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
2624     testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2625 
2626     /* Reservoir sampling using Algorithm R.
2627      * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.)
2628      */
2629     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
2630     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
2631     testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0);
2632     testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0);
2633     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1);
2634     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1);
2635     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6);
2636     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6);
2637     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum5);
2638     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum4);
2639     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum3);
2640     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum2);
2641     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum1);
2642 
2643     /* Bernoulli sampling cases. */
2644     testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty);
2645     testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0);
2646     testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1);
2647     testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6);
2648     testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6);
2649     testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2650     testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60);
2651     testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60);
2652     testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs);
2653 
2654     /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. */
2655     testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01);
2656     testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02);
2657     testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03);
2658     testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01);
2659     testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02);
2660     testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03);
2661     testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40);
2662 
2663     /* Distinct sampling cases. */
2664     testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
2665     testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
2666     testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1);
2667     testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6);
2668     testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
2669 
2670 
2671 
    /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling.
     * For weighted sampling, use the weighted cases, but with the expected results listed in
     * the original input order.
     */
2675     testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2676     testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2677     testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
2678                   data3x6ExpectedWt3ProbsInorder);
2679     testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
2680                   data3x6ExpectedWt3V41ProbsInorder);
2681     testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6],
2682                   data3x6ExpectedDistinctK1K3P60Probs);
2683     testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header",
2684                    "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom);
2685     testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
2686                   data3x6ExpectedDistinctK2P2ProbsInorder);
2687 
2688     /* Simple random sampling with replacement. */
2689     testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty);
2690     testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty);
2691     testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0);
2692     testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0);
2693     testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3);
2694     testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10);
2695     testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77);
2696 
2697     /* Permutations, compatibility mode, without headers. */
2698     testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1..$]);
2699     testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1..$]);
2700     testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1..$]);
2701     testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..$]);
2702     testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..$]);
2703     testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]);
2704     testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]);
2705     testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]);
2706     testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1..$]);
2707 
2708     /* Permutations, no headers, without compatibility mode, or with printing and compatibility mode. */
2709     testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1..$]);
2710     testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1..$]);
2711     testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1..$]);
2712     testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1..$]);
2713     testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]);
2714     testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]);
2715     testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]);
2716 
2717     /* Reservoir sampling using Algorithm R, no headers. */
2718     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
2719     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
2720     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1..$]);
2721     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1..$]);
2722     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]);
2723     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]);
2724     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum5[1..$]);
2725     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum4[1..$]);
2726     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum3[1..$]);
2727     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum2[1..$]);
2728     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum1[1..$]);
2729 
2730     /* Bernoulli sampling cases. */
2731     testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1..$]);
2732     testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
2733     testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
2734     testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]);
2735     testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..$]);
2736     testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1..$]);
2737 
2738     /* Bernoulli sampling with probabilities in skip sampling range. */
2739     testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1..$]);
2740     testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1..$]);
2741     testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..$]);
2742     testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1..$]);
2743     testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1..$]);
2744     testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1..$]);
2745     testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1..$]);
2746 
2747     /* Distinct sampling cases. */
2748     testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]);
2749     testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2750     testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2751     testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2752 
2753     /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */
2754     testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]);
2755     testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1..$]);
2756     testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
2757                   data3x6ExpectedDistinctK1K3P60Probs[1..$]);
2758     testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
2759                   data3x6ExpectedDistinctK2P2ProbsInorder[1..$]);
2760 
2761     /* Simple random sampling with replacement. */
2762     testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
2763     testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
2764     testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1..$]);
2765     testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1..$]);
2766     testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1..$]);
2767 
2768     /* Multi-file tests. */
2769     testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode",
2770                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2771                   combo1ExpectedPermuteCompat);
2772     testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
2773                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2774                   combo1ExpectedPermuteCompatProbs);
2775     testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
2776                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2777                   combo1ExpectedPermuteWt3Probs);
2778     testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode",
2779                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2780                   combo1ExpectedPermuteWt3);
2781     testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4",
2782                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2783                   combo1ExpectedPermuteAlgoRNum4);
2784 
2785     /* Multi-file, no headers. */
2786     testTsvSample(["test-c6", "--static-seed", "--compatibility-mode",
2787                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2788                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2789                   combo1ExpectedPermuteCompat[1..$]);
2790     testTsvSample(["test-c7", "--static-seed", "--print-random",
2791                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2792                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2793                   combo1ExpectedPermuteCompatProbs[1..$]);
2794     testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3",
2795                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2796                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2797                   combo1ExpectedPermuteWt3Probs[1..$]);
2798     testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode",
2799                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2800                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2801                   combo1ExpectedPermuteWt3[1..$]);
2802     testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4",
2803                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2804                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2805                   combo1ExpectedPermuteAlgoRNum4[1..$]);
2806 
2807     /* Bernoulli sampling cases. */
2808     testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5",
2809                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2810                   combo1ExpectedBernoulliCompatP50Probs);
2811     testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4",
2812                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2813                   combo1ExpectedBernoulliCompatP40);
2814     testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5",
2815                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2816                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2817                   combo1ExpectedBernoulliCompatP50Probs[1..$]);
2818     testTsvSample(["test-c14", "--static-seed", "--prob", ".4",
2819                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2820                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2821                   combo1ExpectedBernoulliCompatP40[1..$]);
2822 
2823     /* Bernoulli sampling with probabilities in skip sampling range. */
2824     testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03",
2825                    fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10],
2826                   combo2ExpectedBernoulliSkipV333P03);
2827     testTsvSample(["test-cc1", "-v", "333", "-p", "0.03",
2828                    fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader],
2829                   combo2ExpectedBernoulliSkipV333P03[1..$]);
2830 
2831     /* Distinct sampling cases. */
2832     testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
2833                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2834                   combo1ExpectedDistinctK1P40);
2835     testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4",
2836                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2837                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2838                   combo1ExpectedDistinctK1P40[1..$]);
2839 
2840     /* Generating random weights. */
2841     testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder",
2842                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2843                   combo1ExpectedProbsInorder);
2844     testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder",
2845                    fpath_data3x3_noheader, fpath_data3x1_noheader,
2846                    fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
2847                   combo1ExpectedProbsInorder[1..$]);
2848 
2849     /* Simple random sampling with replacement. */
2850     testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10",
2851                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2852                   combo1ExpectedReplaceNum10);
2853 
2854     testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10",
2855                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2856                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2857                   combo1ExpectedReplaceNum10[1..$]);
2858 
2859     /* Single column file. */
2860     testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
2861     testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
2862 
2863     /* Distributions. */
2864     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
2865     testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs);
2866     testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs);
2867     testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs);
2868     testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs);
2869 
    /* Tests of the subset sample size option (-n|--num).
     *
     * Note: Each test compares against a prefix of the full expected output, which
     * ensures that the subset length does not affect output order.
     */
2875     import std.algorithm : min;
2876     for (size_t n = data3x6.length + 2; n >= 1; n--)
2877     {
2878         /* reservoirSamplingViaHeap.
2879          */
2880         size_t expectedLength = min(data3x6.length, n + 1);
2881         testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
2882                        "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
2883 
2884         testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
2885                        "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
2886 
2887         testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
2888                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]);
2889 
2890         testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
2891                        "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]);
2892 
2893         testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
2894                        "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]);
2895 
2896         testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
2897                        fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]);
2898 
2899         testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
2900                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]);
2901 
2902         testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
2903                        "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]);
2904 
2905         testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string,
2906                        "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]);
2907 
2908         /* Bernoulli sampling.
2909          */
2910         import std.algorithm : min;
2911         size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length);
2912 
2913         testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2914                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]);
2915 
2916         testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2917                        "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]);
2918 
2919         testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2920                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]);
2921 
2922         testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2923                        fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]);
2924 
2925         /* Distinct Sampling.
2926          */
2927         size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length);
2928 
2929         testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
2930                        "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]);
2931 
2932         testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
2933                        fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]);
2934 
2935         testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
2936                        "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]);
2937 
2938         testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
2939                        fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
2940     }
2941 
2942     /* Similar tests with the 1x10 data set. */
2943     for (size_t n = data1x10.length + 2; n >= 1; n--)
2944     {
2945         size_t expectedLength = min(data1x10.length, n + 1);
2946         testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
2947                        "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);
2948 
2949         testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
2950                        "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);
2951 
2952         testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
2953                        fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);
2954 
2955         testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
2956                        "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
2957     }
2958 
2959     /* Simple random sampling with replacement: ensure sample size doesn't change order. */
2960     for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
2961     {
2962         testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
2963                       data3x6ExpectedReplaceNum10[0 .. n + 1]);
2964 
2965         testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
2966                       data3x6ExpectedReplaceNum10[1 .. n + 1]);
2967     }
2968 
2969     /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */
2970     for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
2971     {
2972         size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);
2973 
2974         testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
2975                        "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);
2976 
2977         testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
2978                        fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
2979 }
2980 
2981 
2982     /* Distinct sampling tests. */
2983     testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
2984                   data5x25ExpectedDistinctK2P40);
2985 
2986     testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
2987                   data5x25ExpectedDistinctK2K4P20);
2988 
2989     testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
2990                   data5x25ExpectedDistinctK2K3K4P20);
2991 
2992     testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
2993                   data5x25ExpectedDistinctK2P40[1..$]);
2994 
2995     testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
2996                   data5x25ExpectedDistinctK2K4P20[1..$]);
2997 
2998     testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
2999                   data5x25ExpectedDistinctK2K3K4P20[1..$]);
3000 
3001 
    /* These distinct tests check that using the whole line as the key ('-k 0') and
     * specifying all fields in order produce the same result. They also check that
     * field numbers don't matter: '-k 1,2' in data2x25 selects the same keys as
     * '-k 2,4' in data5x25.
     */
3006     testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25],
3007                   data2x25ExpectedDistinctK1K2P20);
3008 
3009     testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25],
3010                   data2x25ExpectedDistinctK1K2P20);
3011 
3012     testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader],
3013                   data2x25ExpectedDistinctK1K2P20[1..$]);
3014 
3015     testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader],
3016                   data2x25ExpectedDistinctK1K2P20[1..$]);
3017 
3018     /* Similar to the last set, but for a 1-column file. Also with random value printing. */
3019     testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25],
3020                   data1x25ExpectedDistinctK1P20);
3021 
3022     testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25],
3023                   data1x25ExpectedDistinctK1P20);
3024 
3025     testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader],
3026                   data1x25ExpectedDistinctK1P20[1..$]);
3027 
3028     testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader],
3029                   data1x25ExpectedDistinctK1P20[1..$]);
3030 
3031 
3032     testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25],
3033                   data1x25ExpectedDistinctK1P20Probs);
3034 
3035     testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25],
3036                   data1x25ExpectedDistinctK1P20Probs);
3037 
3038     testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader],
3039                   data1x25ExpectedDistinctK1P20Probs[1..$]);
3040 
3041     testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader],
3042                   data1x25ExpectedDistinctK1P20Probs[1..$]);
3043 
3044 
3045     testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25],
3046                   data1x25ExpectedDistinctK1P20ProbsInorder);
3047 
3048     testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25],
3049                   data1x25ExpectedDistinctK1P20ProbsInorder);
3050 
3051     testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader],
3052                   data1x25ExpectedDistinctK1P20ProbsInorder[1..$]);
3053 
3054     testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader],
3055                   data1x25ExpectedDistinctK1P20ProbsInorder[1..$]);
3056 
3057 }