1 /**
2 Command line tool for shuffling or sampling lines from input streams. Several methods
3 are available, including weighted and unweighted shuffling, simple and weighted random
4 sampling, sampling with replacement, Bernoulli sampling, and distinct sampling.
5 
6 Copyright (c) 2017-2020, eBay Inc.
7 Initially written by Jon Degenhardt
8 
9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 module tsv_utils.tsv_sample;
12 
13 import std.array : appender, Appender, RefAppender;
14 import std.range;
15 import std.stdio;
16 import std.typecons : tuple, Flag;
17 
18 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
19 
20 version(unittest)
21 {
22     // When running unit tests, use main from -main compiler switch.
23 }
24 else
25 {
26     /** Main program.
27      *
28      * Invokes command line argument processing and calls tsvSample to do the real
29      * work. Errors occurring during processing are caught and reported to the user.
30      */
31     int main(string[] cmdArgs)
32     {
33         /* When running in DMD code coverage mode, turn on report merging. */
34         version(D_Coverage) version(DigitalMars)
35         {
36             import core.runtime : dmd_coverSetMerge;
37             dmd_coverSetMerge(true);
38         }
39 
40         TsvSampleOptions cmdopt;
41         const r = cmdopt.processArgs(cmdArgs);
42         if (!r[0]) return r[1];
43         version(LDC_Profile)
44         {
45             import ldc.profile : resetAll;
46             resetAll();
47         }
48         try
49         {
50             import tsv_utils.common.utils : BufferedOutputRange;
51             auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
52 
53             tsvSample(cmdopt, bufferedOutput);
54         }
55         catch (Exception exc)
56         {
57             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
58             return 1;
59         }
60         return 0;
61     }
62 }
63 
64 immutable helpText = q"EOS
65 Synopsis: tsv-sample [options] [file...]
66 
67 Sample input lines or randomize their order. Several modes of operation
68 are available:
69 * Shuffling (the default): All input lines are output in random order. All
70   orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines is selected and
72   written to standard output. By default, selected lines are written in
73   random order. All sample sets and orderings are equally likely. Use
74   --i|inorder to write the selected lines in the original input order.
75 * Weighted random sampling (--n|num N, --w|weight-field F): A weighted
76   sample of N lines is produced. Weights are taken from field F. Lines are
77   output in weighted selection order. Use --i|inorder to write in original
78   input order. Omit --n|num to shuffle all lines (weighted shuffling).
79 * Sampling with replacement (--r|replace, --n|num N): All input lines are
80   read in, then lines are repeatedly selected at random and written out.
81   This continues until N lines are output. Individual lines can be written
82   multiple times. Output continues forever if N is zero or not provided.
83 * Bernoulli sampling (--p|prob P): A random subset of lines is selected
84   based on probability P, a 0.0-1.0 value. This is a streaming operation.
85   A decision is made on each line as it is read. Line order is not changed.
86 * Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys is chosen based
88   on the inclusion probability (a 'distinct' set of keys). All lines with
89   one of the selected keys are output. Line order is not changed.
90 
91 Use '--help-verbose' for detailed information.
92 
93 Options:
94 EOS";
95 
96 immutable helpTextVerbose = q"EOS
97 Synopsis: tsv-sample [options] [file...]
98 
99 Sample input lines or randomize their order. Several modes of operation
100 are available:
101 * Shuffling (the default): All input lines are output in random order. All
102   orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines is selected and
104   written to standard output. By default, selected lines are written in
105   random order. All sample sets and orderings are equally likely. Use
106   --i|inorder to write the selected lines in the original input order.
107 * Weighted random sampling (--n|num N, --w|weight-field F): A weighted
108   sample of N lines is produced. Weights are taken from field F. Lines are
109   output in weighted selection order. Use --i|inorder to write in original
110   input order. Omit --n|num to shuffle all lines (weighted shuffling).
111 * Sampling with replacement (--r|replace, --n|num N): All input lines are
112   read in, then lines are repeatedly selected at random and written out.
113   This continues until N lines are output. Individual lines can be written
114   multiple times. Output continues forever if N is zero or not provided.
115 * Bernoulli sampling (--p|prob P): A random subset of lines is selected
116   based on probability P, a 0.0-1.0 value. This is a streaming operation.
117   A decision is made on each line as it is read. Line order is not changed.
118 * Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys is chosen based
120   on the inclusion probability (a 'distinct' set of keys). All lines with
121   one of the selected keys are output. Line order is not changed.
122 
123 Sample size: The '--n|num' option controls the sample size for all
124 sampling methods. In the case of simple and weighted random sampling it
125 also limits the amount of memory required.
126 
127 Controlling the random seed: By default, each run produces a different
128 randomization or sampling. Using '--s|static-seed' changes this so
129 multiple runs produce the same results. This works by using the same
130 random seed each run. The random seed can be specified using
131 '--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
132 value is a no-op and ignored.)
133 
134 Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, so there is no memory accumulation. These algorithms
136 can run on arbitrary size inputs. Sampling with replacement reads all
137 lines into memory and is limited by available memory. Shuffling also reads
138 all lines into memory and is similarly limited. Random sampling uses
139 reservoir sampling, and only needs to hold the sample size (--n|num) in
140 memory. The input data can be of any length.
141 
142 Weighted sampling: Weighted random sampling is done using an algorithm
143 described by Pavlos Efraimidis and Paul Spirakis. Weights should be
144 positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights; it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and are given the value zero. Input order is not retained;
instead, lines are output ordered by the randomized weight assigned. This
149 means that a smaller valid sample can be produced by taking the first N
150 lines of output. For more info on the sampling approach see:
151 * Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
152 * "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
153   (https://arxiv.org/abs/1012.0256)
154 
155 Printing random values: Most of the sampling algorithms work by generating
156 a random value for each line. (See "Compatibility mode" below.) The nature
157 of these values depends on the sampling algorithm. They are used for both
158 line selection and output ordering. The '--p|print-random' option can be
used to print these values. The random value is prepended to the line,
160 separated by the --d|delimiter char (TAB by default). The
161 '--gen-random-inorder' option takes this one step further, generating
162 random values for all input lines without changing the input order. The
163 types of values currently used by these sampling algorithms:
164 * Unweighted sampling: Uniform random value in the interval [0,1]. This
165   includes Bernoulli sampling and unweighted line order randomization.
166 * Weighted sampling: Value in the interval [0,1]. Distribution depends on
167   the values in the weight field. It is used as a partial ordering.
168 * Distinct sampling: An integer, zero and up, representing a selection
169   group. The inclusion probability determines the number of selection groups.
170 * Sampling with replacement: Random value printing is not supported.
171 
172 The specifics behind these random values are subject to change in future
173 releases.
174 
175 Compatibility mode: As described above, many of the sampling algorithms
176 assign a random value to each line. This is useful when printing random
177 values. It has another occasionally useful property: repeated runs with
178 the same static seed but different selection parameters are more
179 compatible with each other, as each line gets assigned the same random
180 value on every run. For example, if Bernoulli sampling is run with
181 '--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
182 all the lines selected in the first run will be selected in the second.
183 This comes at a cost: in some cases there are faster algorithms that don't
184 preserve this property. By default, tsv-sample will use faster algorithms
185 when available. However, the '--compatibility-mode' option switches to
186 algorithms that assign a random value per line. Printing random values
187 also engages compatibility mode.
188 
189 Options:
190 EOS";
191 
192 /** Container for command line options and derived data.
193  *
194  * TsvSampleOptions handles several aspects of command line options. On the input side,
195  * it defines the command line options available, performs validation, and sets up any
196  * derived state based on the options provided. These activities are handled by the
197  * processArgs() member.
198  *
199  * Once argument processing is complete, TsvSampleOptions is used as a container
200  * holding the specific processing options used by the different sampling routines.
201  */
202 struct TsvSampleOptions
203 {
204     string programName;                        /// Program name
205     string[] files;                            /// Input files
206     bool helpVerbose = false;                  /// --help-verbose
207     bool hasHeader = false;                    /// --H|header
208     ulong sampleSize = 0;                      /// --n|num - Size of the desired sample
209     double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
210     size_t[] keyFields;                        /// --k|key-fields - Used with inclusion probability
211     size_t weightField = 0;                    /// --w|weight-field - Field holding the weight
212     bool srsWithReplacement = false;           /// --r|replace
213     bool preserveInputOrder = false;           /// --i|inorder
214     bool staticSeed = false;                   /// --s|static-seed
215     uint seedValueOptionArg = 0;               /// --v|seed-value
216     bool printRandom = false;                  /// --print-random
217     bool genRandomInorder = false;             /// --gen-random-inorder
218     string randomValueHeader = "random_value"; /// --random-value-header
219     bool compatibilityMode = false;            /// --compatibility-mode
220     char delim = '\t';                         /// --d|delimiter
221     bool versionWanted = false;                /// --V|version
222     bool preferSkipSampling = false;           /// --prefer-skip-sampling
223     bool preferAlgorithmR = false;             /// --prefer-algorithm-r
224     bool hasWeightField = false;               /// Derived.
225     bool useBernoulliSampling = false;         /// Derived.
226     bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
228     bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
229     uint seed = 0;                             /// Derived from --static-seed, --seed-value
230 
231     /** Process tsv-sample command line arguments.
232      *
233      * Defines the command line options, performs validation, and derives additional
     * state. std.getopt.getopt is called to do the main option processing, followed
     * by additional validation and derivation.
236      *
237      * Help text is printed to standard output if help was requested. Error text is
238      * written to stderr if invalid input is encountered.
239      *
     * A tuple is returned. The first value is true if command line arguments were
241      * successfully processed and execution should continue, or false if an error
242      * occurred or the user asked for help. If false, the second value is the
243      * appropriate exit code (0 or 1).
244      *
245      * Returning true (execution continues) means args have been validated and derived
246      * values calculated. Field indices will have been converted to zero-based.
247      */
248     auto processArgs(ref string[] cmdArgs)
249     {
250         import std.algorithm : any, canFind, each;
251         import std.getopt;
252         import std.math : isNaN;
253         import std.path : baseName, stripExtension;
254         import std.typecons : Yes, No;
255         import tsv_utils.common.utils : makeFieldListOptionHandler;
256 
257         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
258 
259         try
260         {
261             arraySep = ",";    // Use comma to separate values in command line options
262             auto r = getopt(
263                 cmdArgs,
264                 "help-verbose",    "     Print more detailed help.", &helpVerbose,
265 
266                 std.getopt.config.caseSensitive,
267                 "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
268                 std.getopt.config.caseInsensitive,
269 
270                 "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
271                 "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,
272 
273                 "k|key-fields",    "<field-list>  Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
274                 keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
275 
276                 "w|weight-field",  "NUM  Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
277                 "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
278                 "i|inorder",       "     Output random samples in original input order. Requires use of --n|num.", &preserveInputOrder,
279                 "s|static-seed",   "     Use the same random seed every run.", &staticSeed,
280 
281                 std.getopt.config.caseSensitive,
282                 "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
283                 std.getopt.config.caseInsensitive,
284 
285                 "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
286                 "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
287                 "random-value-header",  "     Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
288                 "compatibility-mode", "     Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,
289 
290                 "d|delimiter",     "CHR  Field delimiter.", &delim,
291 
292                 std.getopt.config.caseSensitive,
293                 "V|version",       "     Print version information and exit.", &versionWanted,
294                 std.getopt.config.caseInsensitive,
295 
296                 "prefer-skip-sampling", "     (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
297                 &preferSkipSampling,
298 
299                 "prefer-algorithm-r",   "     (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
300                 &preferAlgorithmR,
301                 );
302 
303             if (r.helpWanted)
304             {
305                 defaultGetoptPrinter(helpText, r.options);
306                 return tuple(false, 0);
307             }
308             else if (helpVerbose)
309             {
310                 defaultGetoptPrinter(helpTextVerbose, r.options);
311                 return tuple(false, 0);
312             }
313             else if (versionWanted)
314             {
315                 import tsv_utils.common.tsvutils_version;
316                 writeln(tsvutilsVersionNotice("tsv-sample"));
317                 return tuple(false, 0);
318             }
319 
320             /* Derivations and validations. */
321             if (weightField > 0)
322             {
323                 hasWeightField = true;
324                 weightField--;    // Switch to zero-based indexes.
325             }
326 
327             if (srsWithReplacement)
328             {
329                 if (hasWeightField)
330                 {
331                     throw new Exception("Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");
332                 }
333                 else if (!inclusionProbability.isNaN)
334                 {
335                     throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");
336                 }
337                 else if (keyFields.length > 0)
338                 {
339                     throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");
340                 }
341                 else if (printRandom || genRandomInorder)
342                 {
343                     throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");
344                 }
345                 else if (preserveInputOrder)
346                 {
347                     throw new Exception("Sampling with replacement (--r|replace) does not support input order preservation (--i|inorder option).");
348                 }
349             }
350 
351             if (keyFields.length > 0)
352             {
353                 /* Note: useDistinctSampling is set as part of the inclusion probability checks below. */
354 
355                 if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields.");
356 
357                 if (keyFields.length == 1 && keyFields[0] == 0)
358                 {
359                     distinctKeyIsFullLine = true;
360                 }
361                 else
362                 {
363                     if (keyFields.length > 1 && keyFields.any!(x => x == 0))
364                     {
365                         throw new Exception("Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");
366                     }
367 
368                     keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
369                 }
370             }
371 
372             /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */
373             if (!inclusionProbability.isNaN)
374             {
375                 if (inclusionProbability <= 0.0 || inclusionProbability > 1.0)
376                 {
377                     import std.format : format;
378                     throw new Exception(
379                         format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));
380                 }
381 
382                 if (keyFields.length > 0) useDistinctSampling = true;
383                 else useBernoulliSampling = true;
384 
385                 if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together.");
386 
387                 if (genRandomInorder && !useDistinctSampling)
388                 {
389                     throw new Exception("--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used.");
390                 }
391             }
392             else if (genRandomInorder && !hasWeightField)
393             {
394                 useBernoulliSampling = true;
395             }
396 
397             if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') ||
398                 randomValueHeader.canFind(delim))
399             {
400                 throw new Exception("--randomValueHeader must be at least one character and not contain field delimiters or newlines.");
401             }
402 
403             /* Check for incompatible use of (--i|inorder) and shuffling of the full
             * data set. Sampling with replacement is also incompatible, but this is
             * detected earlier. Shuffling is the default operation, so it is identified
             * by eliminating the other modes of operation.
407              */
408             if (preserveInputOrder &&
409                 sampleSize == 0 &&
410                 !useBernoulliSampling &&
411                 !useDistinctSampling
412                )
413             {
414                 throw new Exception("Preserving input order (--i|inorder) is not compatible with full data set shuffling. Switch to random sampling with a sample size (--n|num) to use --i|inorder.");
415             }
416 
417             /* Compatibility mode checks:
             * - Random value printing implies compatibility-mode; otherwise the user's
             *   selection is used.
420              * - Distinct sampling doesn't support compatibility-mode. The routines
421              *   don't care, but users might expect larger probabilities to be a
422              *   superset of smaller probabilities. This would be confusing, so
423              *   flag it as an error.
424              */
425             if (compatibilityMode && useDistinctSampling)
426             {
427                 throw new Exception("Distinct sampling (--k|key-fields --p|prob) does not support --compatibility-mode.");
428             }
429 
430             if (printRandom || genRandomInorder) compatibilityMode = true;
431 
432             /* Seed. */
433             import std.random : unpredictableSeed;
434 
435             usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);
436 
437             if (usingUnpredictableSeed) seed = unpredictableSeed;
438             else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
439             else if (staticSeed) seed = 2438424139;
440             else assert(0, "Internal error, invalid seed option states.");
441 
442             /* Assume remaining args are files. Use standard input if files were not provided. */
443             files ~= (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
444             cmdArgs.length = 1;
445         }
446         catch (Exception exc)
447         {
448             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
449             return tuple(false, 1);
450         }
451         return tuple(true, 0);
452     }
453 }
454 /** Invokes the appropriate sampling routine based on the command line arguments.
455  *
456  * tsvSample is the top-level routine handling the different tsv-sample use cases.
 * Its primary role is to invoke the correct routine for the type of sampling requested.
458  */
459 void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
460 if (isOutputRange!(OutputRange, char))
461 {
462     if (cmdopt.srsWithReplacement)
463     {
464         simpleRandomSamplingWithReplacement(cmdopt, outputStream);
465     }
466     else if (cmdopt.useBernoulliSampling)
467     {
468         bernoulliSamplingCommand(cmdopt, outputStream);
469     }
470     else if (cmdopt.useDistinctSampling)
471     {
472         if (cmdopt.genRandomInorder) distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
473         else distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
474     }
475     else if (cmdopt.genRandomInorder)
476     {
477         /* Note that the preceding cases handle gen-random-inorder themselves (Bernoulli,
478          * Distinct), or don't handle it (SRS w/ Replacement).
479          */
480         assert(cmdopt.hasWeightField);
481         generateWeightedRandomValuesInorder(cmdopt, outputStream);
482     }
483     else if (cmdopt.sampleSize != 0)
484     {
485         randomSamplingCommand(cmdopt, outputStream);
486     }
487     else
488     {
489         shuffleCommand(cmdopt, outputStream);
490     }
491 }
492 
493 /** Bernoulli sampling command handler. Invokes the appropriate Bernoulli sampling
494  * routine based on the command line arguments.
495  *
496  * This routine selects the appropriate Bernoulli sampling function and template
497  * instantiation to use based on the command line arguments.
498  *
499  * One of the basic choices is whether to use the vanilla algorithm or skip sampling.
500  * Skip sampling is a little bit faster when the inclusion probability is small but
501  * doesn't support compatibility mode. See the bernoulliSkipSampling documentation
502  * for a discussion of the skipSamplingProbabilityThreshold used here.
503  */
504 void bernoulliSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
505 if (isOutputRange!(OutputRange, char))
506 {
507     assert(!cmdopt.hasWeightField);
508 
509     immutable double skipSamplingProbabilityThreshold = 0.04;
510 
511     if (cmdopt.compatibilityMode ||
512         (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling))
513     {
514         if (cmdopt.genRandomInorder)
515         {
516             bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
517         }
518         else
519         {
520             bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
521         }
522     }
523     else
524     {
525         bernoulliSkipSampling(cmdopt, outputStream);
526     }
527 }
528 
529 /** Bernoulli sampling of lines from the input stream.
530  *
 * Each input line is assigned a random value and output if the value is less than
 * cmdopt.inclusionProbability. The order of the lines is not changed.
533  *
534  * This routine supports random value printing and gen-random-inorder value printing.
535  */
536 void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
537     (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
538 if (isOutputRange!(OutputRange, char))
539 {
540     import std.random : Random = Mt19937, uniform01;
541     import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;
542 
543     static if (generateRandomAll) assert(cmdopt.genRandomInorder);
544     else assert(!cmdopt.genRandomInorder);
545 
546     auto randomGenerator = Random(cmdopt.seed);
547 
548     /* Process each line. */
549     bool headerWritten = false;
550     ulong numLinesWritten = 0;
551     foreach (filename; cmdopt.files)
552     {
553         auto inputStream = (filename == "-") ? stdin : filename.File();
554         foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
555         {
556             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
557             if (fileLineNum == 1 && cmdopt.hasHeader)
558             {
559                 if (!headerWritten)
560                 {
561                     static if (generateRandomAll)
562                     {
563                         outputStream.put(cmdopt.randomValueHeader);
564                         outputStream.put(cmdopt.delim);
565                     }
566                     else if (cmdopt.printRandom)
567                     {
568                         outputStream.put(cmdopt.randomValueHeader);
569                         outputStream.put(cmdopt.delim);
570                     }
571 
572                     outputStream.put(line);
573                     outputStream.put("\n");
574                     headerWritten = true;
575                 }
576             }
577             else
578             {
579                 immutable double lineScore = uniform01(randomGenerator);
580 
581                 static if (generateRandomAll)
582                 {
583                     outputStream.formatRandomValue(lineScore);
584                     outputStream.put(cmdopt.delim);
585                     outputStream.put(line);
586                     outputStream.put("\n");
587 
588                     if (cmdopt.sampleSize != 0)
589                     {
590                         ++numLinesWritten;
591                         if (numLinesWritten == cmdopt.sampleSize) return;
592                     }
593                 }
594                 else if (lineScore < cmdopt.inclusionProbability)
595                 {
596                     if (cmdopt.printRandom)
597                     {
598                         outputStream.formatRandomValue(lineScore);
599                         outputStream.put(cmdopt.delim);
600                     }
601                     outputStream.put(line);
602                     outputStream.put("\n");
603 
604                     if (cmdopt.sampleSize != 0)
605                     {
606                         ++numLinesWritten;
607                         if (numLinesWritten == cmdopt.sampleSize) return;
608                     }
609                 }
610             }
611         }
612     }
613 }
614 
615 /** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips.
616  *
617  * Skip sampling works by skipping a random number of lines between selections. This
618  * can be faster than assigning a random value to each line when the inclusion
619  * probability is low, as it reduces the number of calls to the random number
620  * generator. Both the random number generator and the log() function are called when
621  * calculating the next skip size. These additional log() calls add up as the
622  * inclusion probability increases.
623  *
624  * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
625  * file-oriented line sampling. This is obviously environment specific. In the
 * environments where this implementation has been tested, the performance improvements
 * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
628  *
629  * The algorithm does not assign random values to individual lines. This makes it
630  * incompatible with random value printing. It is not suitable for compatibility mode
631  * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
632  * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
633  * does not have this property.
634  *
635  * The algorithm for calculating the skip size has been described by multiple sources.
636  * There are two key variants depending on whether the total number of lines in the
637  * data set is known in advance. (This implementation does not know the total.)
638  * Useful references:
639  * $(LIST
640  *     * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
641  *       ACM Trans on Mathematical Software, 1987. On-line:
642  *       http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
643  *     * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
644  *       "Data Stream Management", Springer-Verlag, 2016. On-line:
645  *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
646  *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
647  *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
648  * )
649  */
650 void bernoulliSkipSampling(OutputRange)(TsvSampleOptions cmdopt, OutputRange outputStream)
651     if (isOutputRange!(OutputRange, char))
652 {
653     import std.conv : to;
654     import std.math : log, trunc;
655     import std.random : Random = Mt19937, uniform01;
656     import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;
657 
658     assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
659     assert(!cmdopt.printRandom);
660     assert(!cmdopt.compatibilityMode);
661 
662     auto randomGenerator = Random(cmdopt.seed);
663 
664     immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
665     immutable double logDiscardRate = log(discardRate);
666 
667     /* Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed
668      * interval to (0.0, 1.0], excluding 0.0.
669      */
670     size_t remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
671 
672     /* Process each line. */
673     bool headerWritten = false;
674     ulong numLinesWritten = 0;
675     foreach (filename; cmdopt.files)
676     {
677         auto inputStream = (filename == "-") ? stdin : filename.File();
678         foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
679         {
680             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
681             if (fileLineNum == 1 && cmdopt.hasHeader)
682             {
683                 if (!headerWritten)
684                 {
685                     outputStream.put(line);
686                     outputStream.put("\n");
687                     headerWritten = true;
688                 }
689             }
690             else if (remainingSkips > 0)
691             {
692                 --remainingSkips;
693             }
694             else
695             {
696                 outputStream.put(line);
697                 outputStream.put("\n");
698 
699                 if (cmdopt.sampleSize != 0)
700                 {
701                     ++numLinesWritten;
702                     if (numLinesWritten == cmdopt.sampleSize) return;
703                 }
704 
705                 remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
706             }
707         }
708     }
709 }
710 
711 /** Sample lines by choosing a random set of distinct keys formed from one or more
712  * fields on each line.
713  *
714  * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling.
715  * However, instead of each line being subject to an independent trial, lines are
716  * selected based on a key from each line. A portion of keys are randomly selected for
717  * output, and every line containing a selected key is included in the output.
718  *
719  * An example use-case is a query log having <user, query, clicked-url> triples. It is
720  * often useful to sample records for portion of the users, but including all records
721  * for the users selected. Distinct sampling supports this by selecting a subset of
722  * users to include in the output.
723  *
724  * Distinct sampling is done by hashing the key and mapping the hash value into
725  * buckets sized to hold the inclusion probability. Records having a key mapping to
726  * bucket zero are output. Buckets are equal size and therefore may be larger than the
 * inclusion probability. (The other approach would be to have the caller specify
 * the number of buckets. More correct, but less convenient.)
729  */
730 void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
731     (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
732 if (isOutputRange!(OutputRange, char))
733 {
734     import std.algorithm : splitter;
735     import std.conv : to;
736     import std.digest.murmurhash;
737     import std.math : lrint;
738     import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, throwIfWindowsNewlineOnUnix;
739 
740     static if (generateRandomAll) assert(cmdopt.genRandomInorder);
741     else assert(!cmdopt.genRandomInorder);
742 
743     assert(cmdopt.keyFields.length > 0);
744     assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);
745 
746     static if (generateRandomAll)
747     {
748         import std.format : formatValue, singleSpec;
749         immutable randomValueFormatSpec = singleSpec("%d");
750     }
751 
752     immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys.
753 
754     uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint;
755 
756     /* Create a mapping for the key fields. */
757     auto keyFieldsReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);
758 
759     /* Process each line. */
760     bool headerWritten = false;
761     ulong numLinesWritten = 0;
762     foreach (filename; cmdopt.files)
763     {
764         auto inputStream = (filename == "-") ? stdin : filename.File();
765         foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
766         {
767             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
768             if (fileLineNum == 1 && cmdopt.hasHeader)
769             {
770                 if (!headerWritten)
771                 {
772                     static if (generateRandomAll)
773                     {
774                         outputStream.put(cmdopt.randomValueHeader);
775                         outputStream.put(cmdopt.delim);
776                     }
777                     else if (cmdopt.printRandom)
778                     {
779                         outputStream.put(cmdopt.randomValueHeader);
780                         outputStream.put(cmdopt.delim);
781                     }
782 
783                     outputStream.put(line);
784                     outputStream.put("\n");
785                     headerWritten = true;
786                 }
787             }
788             else
789             {
790                 /* Murmurhash works by successively adding individual keys, then finalizing.
791                  * Adding individual keys is simpler if the full-line-as-key and individual
792                  * fields as keys cases are separated.
793                  */
794                 auto hasher = MurmurHash3!32(cmdopt.seed);
795 
796                 if (cmdopt.distinctKeyIsFullLine)
797                 {
798                     hasher.put(cast(ubyte[]) line);
799                 }
800                 else
801                 {
802                     assert(keyFieldsReordering !is null);
803 
804                     /* Gather the key field values and assemble the key. */
805                     keyFieldsReordering.initNewLine;
806                     foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
807                     {
808                         keyFieldsReordering.processNextField(fieldIndex, fieldValue);
809                         if (keyFieldsReordering.allFieldsFilled) break;
810                     }
811 
812                     if (!keyFieldsReordering.allFieldsFilled)
813                     {
814                         import std.format : format;
815                         throw new Exception(
816                             format("Not enough fields in line. File: %s, Line: %s",
817                                    (filename == "-") ? "Standard Input" : filename, fileLineNum));
818                     }
819 
820                     foreach (count, key; keyFieldsReordering.outputFields.enumerate)
821                     {
822                         if (count > 0) hasher.put(delimArray);
823                         hasher.put(cast(ubyte[]) key);
824                     }
825                 }
826 
827                 hasher.finish;
828 
829                 static if (generateRandomAll)
830                 {
831                     import std.conv : to;
832                     outputStream.formatValue(hasher.get % numBuckets, randomValueFormatSpec);
833                     outputStream.put(cmdopt.delim);
834                     outputStream.put(line);
835                     outputStream.put("\n");
836 
837                     if (cmdopt.sampleSize != 0)
838                     {
839                         ++numLinesWritten;
840                         if (numLinesWritten == cmdopt.sampleSize) return;
841                     }
842                 }
843                 else if (hasher.get % numBuckets == 0)
844                 {
845                     if (cmdopt.printRandom)
846                     {
847                         outputStream.put('0');
848                         outputStream.put(cmdopt.delim);
849                     }
850                     outputStream.put(line);
851                     outputStream.put("\n");
852 
853                     if (cmdopt.sampleSize != 0)
854                     {
855                         ++numLinesWritten;
856                         if (numLinesWritten == cmdopt.sampleSize) return;
857                     }
858                 }
859             }
860         }
861     }
862 }
863 
864 /** Random sampling command handler. Invokes the appropriate sampling routine based on
865  * the command line arguments.
866  *
867  * Random sampling selects a fixed size random sample from the input stream. Both
868  * simple random sampling (equal likelihood) and weighted random sampling are
869  * supported. Selected lines are output either in random order or original input order.
870  * For weighted sampling the random order is the weighted selection order.
871  *
872  * Two algorithms are used, reservoir sampling via a heap and reservoir sampling via
873  * Algorithm R. This routine selects the appropriate reservoir sampling function and
 * template instantiation to use based on the command line arguments.
875  *
876  * Weighted sampling always uses the heap approach. Compatibility mode does as well,
877  * as it is the method that uses per-line random value assignments. The implication
878  * of compatibility mode is that a larger sample size includes all the results from
879  * a smaller sample, assuming the same random seed is used.
880  *
881  * For unweighted sampling there is a performance tradeoff between implementations.
882  * Heap-based sampling is faster for small sample sizes. Algorithm R is faster for
883  * large sample sizes. The threshold used was chosen based on performance tests. See
884  * the reservoirSamplingAlgorithmR documentation for more information.
885  */
887 void randomSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
888 if (isOutputRange!(OutputRange, char))
889 {
890     assert(cmdopt.sampleSize != 0);
891 
892     immutable size_t algorithmRSampleSizeThreshold = 128 * 1024;
893 
894     if (cmdopt.hasWeightField)
895     {
896         if (cmdopt.preserveInputOrder)
897         {
898             reservoirSamplingViaHeap!(Yes.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream);
899         }
900         else
901         {
902             reservoirSamplingViaHeap!(Yes.isWeighted, No.preserveInputOrder)(cmdopt, outputStream);
903         }
904     }
905     else if (cmdopt.compatibilityMode ||
906              (cmdopt.sampleSize < algorithmRSampleSizeThreshold && !cmdopt.preferAlgorithmR))
907     {
908         if (cmdopt.preserveInputOrder)
909         {
910             reservoirSamplingViaHeap!(No.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream);
911         }
912         else
913         {
914             reservoirSamplingViaHeap!(No.isWeighted, No.preserveInputOrder)(cmdopt, outputStream);
915         }
916     }
917     else if (cmdopt.preserveInputOrder)
918     {
919         reservoirSamplingAlgorithmR!(Yes.preserveInputOrder)(cmdopt, outputStream);
920     }
921     else
922     {
923         reservoirSamplingAlgorithmR!(No.preserveInputOrder)(cmdopt, outputStream);
924     }
925 }
926 
927 /** Reservoir sampling using a heap. Both weighted and unweighted random sampling are
928  * supported.
929  *
930  * The algorithm used here is based on the one-pass algorithm described by Pavlos
931  * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
932  * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are
933  * simply set to one.
934  *
935  * The implementation uses a heap (priority queue) large enough to hold the desired
936  * number of lines. Input is read line-by-line, assigned a random value, and added to
937  * the heap. The role of the heap is to identify the lines with the highest assigned
938  * random values. Once the heap is full, adding a new line means dropping the line with
 * the lowest score. A "min" heap is used for this reason.
940  *
941  * When done reading all lines, the "min" heap is in reverse of weighted selection
 * order. Weighted selection order is obtained by removing each element one at a time
943  * from the heap. The underlying data store will have the elements in weighted selection
944  * order (largest weights first).
945  *
946  * Generating output in weighted order is useful for several reasons:
947  *  - For weighted sampling, it preserves the property that smaller valid subsets can be
948  *    created by taking the first N lines.
949  *  - For unweighted sampling, it ensures that all output permutations are possible, and
950  *    are not influenced by input order or the heap data structure used.
951  *  - Order consistency is maintained when making repeated use of the same random seed,
952  *    but with different sample sizes.
953  *
 * The other choice is preserving input order. This is supported by recording line
955  * numbers and sorting the selected sample.
956  *
957  * There are use cases where only the selection set matters. For these some performance
958  * could be gained by skipping the reordering and simply printing the backing store
959  * array in-order. Performance tests indicate only a minor benefit, so this is not
960  * supported.
961  *
962  * Notes:
963  * $(LIST
964  *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
965  *      randomization of all input lines. This was dropped in version 1.2.2 in favor
966  *      of the approach used in randomizeLines. The latter has significant advantages
967  *      given that all data must be read into memory.
968  *    * For large reservoir sizes better performance can be achieved using Algorithm R.
969  *      See the reservoirSamplingAlgorithmR documentation for details.
970  * )
971  */
972 void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
973     (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
974 if (isOutputRange!(OutputRange, char))
975 {
976     import std.algorithm : sort;
977     import std.container.array;
978     import std.container.binaryheap;
979     import std.meta : AliasSeq;
980     import std.random : Random = Mt19937, uniform01;
981     import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;
982 
983     static if (isWeighted) assert(cmdopt.hasWeightField);
984     else assert(!cmdopt.hasWeightField);
985 
986     assert(cmdopt.sampleSize > 0);
987 
988     auto randomGenerator = Random(cmdopt.seed);
989 
990     static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
991     {
992         double score;
993         const(char)[] line;
994         static if (preserveInputOrder) ulong lineNumber;
995     }
996 
997     /* Create the heap and backing data store.
998      *
999      * Note: An std.container.array is used as the backing store to avoid some issues in
1000      * the standard library (Phobos) binaryheap implementation. Specifically, when an
 * std.container.array is used as backing store, the heap can be efficiently reversed by
1002      * removing the heap elements. This leaves the backing store in the reversed order.
1003      * However, the current binaryheap implementation does not support this for all
1004      * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
1005      */
1006 
1007     Array!(Entry!preserveInputOrder) dataStore;
1008     dataStore.reserve(cmdopt.sampleSize);
1009     auto reservoir = dataStore.heapify!("a.score > b.score")(0);  // Min binaryheap
1010 
1011     /* Process each line. */
1012     bool headerWritten = false;
1013     static if (preserveInputOrder) ulong totalLineNum = 0;
1014     foreach (filename; cmdopt.files)
1015     {
1016         auto inputStream = (filename == "-") ? stdin : filename.File();
1017         foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
1018         {
1019             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
1020             if (fileLineNum == 1 && cmdopt.hasHeader)
1021             {
1022                 if (!headerWritten)
1023                 {
1024                     if (cmdopt.printRandom)
1025                     {
1026                         outputStream.put(cmdopt.randomValueHeader);
1027                         outputStream.put(cmdopt.delim);
1028                     }
1029                     outputStream.put(line);
1030                     outputStream.put("\n");
1031                     headerWritten = true;
1032                 }
1033             }
1034             else
1035             {
1036                 static if (!isWeighted)
1037                 {
1038                     immutable double lineScore = uniform01(randomGenerator);
1039                 }
1040                 else
1041                 {
1042                     immutable double lineWeight =
1043                         getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);
1044                     immutable double lineScore =
1045                         (lineWeight > 0.0)
1046                         ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
1047                         : 0.0;
1048                 }
1049 
1050                 static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
1051                 else alias entryCTArgs = AliasSeq!();
1052 
1053                 if (reservoir.length < cmdopt.sampleSize)
1054                 {
1055                     reservoir.insert(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs));
1056                 }
1057                 else if (reservoir.front.score < lineScore)
1058                 {
1059                     reservoir.replaceFront(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs));
1060                 }
1061 
1062                 static if (preserveInputOrder) ++totalLineNum;
1063             }
1064         }
1065     }
1066 
1067     /* Done with input, all entries are in the reservoir. */
1068 
1069     /* The asserts here avoid issues with the current binaryheap implementation. They
1070      * detect use of backing stores having a length not synchronized to the reservoir.
1071      */
1072     immutable ulong numLines = reservoir.length;
1073     assert(numLines == dataStore.length);
1074 
1075     /* Update the backing store so it is in the desired output order.
1076      */
1077     static if (preserveInputOrder)
1078     {
1079         dataStore[].sort!((a, b) => a.lineNumber < b.lineNumber);
1080     }
1081     else
1082     {
1083         /* Output in weighted selection order. The heap is in reverse order of assigned
1084          * weights. Reversing order is done by removing all elements from the heap. This
1085          * leaves the backing store in the correct order.
1086          */
1087         while (!reservoir.empty) reservoir.removeFront;
1088     }
1089 
1090     assert(numLines == dataStore.length);
1091 
1092     foreach (entry; dataStore)
1093     {
1094         if (cmdopt.printRandom)
1095         {
1096             outputStream.formatRandomValue(entry.score);
1097             outputStream.put(cmdopt.delim);
1098         }
1099         outputStream.put(entry.line);
1100         outputStream.put("\n");
1101     }
1102  }
1103 
1104 /** Generate weighted random values for all input lines, preserving input order.
1105  *
1106  * This complements weighted reservoir sampling, but instead of using a reservoir it
1107  * simply iterates over the input lines generating the values. The weighted random
1108  * values are generated with the same formula used by reservoirSampling.
1109  */
1110 void generateWeightedRandomValuesInorder(OutputRange)
1111     (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1112 if (isOutputRange!(OutputRange, char))
1113 {
1114     import std.random : Random = Mt19937, uniform01;
1115     import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;
1116 
1117     assert(cmdopt.hasWeightField);
1118 
1119     auto randomGenerator = Random(cmdopt.seed);
1120 
1121     /* Process each line. */
1122     bool headerWritten = false;
1123     ulong numLinesWritten = 0;
1124     foreach (filename; cmdopt.files)
1125     {
1126         auto inputStream = (filename == "-") ? stdin : filename.File();
1127         foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
1128         {
1129             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
1130             if (fileLineNum == 1 && cmdopt.hasHeader)
1131             {
1132                 if (!headerWritten)
1133                 {
1134                     outputStream.put(cmdopt.randomValueHeader);
1135                     outputStream.put(cmdopt.delim);
1136                     outputStream.put(line);
1137                     outputStream.put("\n");
1138                     headerWritten = true;
1139                 }
1140             }
1141             else
1142                {
1143                 immutable double lineWeight =
1144                     getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);
1145 
1146                 immutable double lineScore =
1147                     (lineWeight > 0.0)
1148                     ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
1149                     : 0.0;
1150 
1151                 outputStream.formatRandomValue(lineScore);
1152                 outputStream.put(cmdopt.delim);
1153                 outputStream.put(line);
1154                 outputStream.put("\n");
1155 
1156                 if (cmdopt.sampleSize != 0)
1157                 {
1158                     ++numLinesWritten;
1159                     if (numLinesWritten == cmdopt.sampleSize) return;
1160                 }
1161             }
1162         }
1163     }
1164 }
1165 
1166 /** Reservoir sampling via Algorithm R
1167  *
1168  * This is an implementation of reservoir sampling using what is commonly known as
1169  * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of
1170  * Computer Programming, Volume 2: Seminumerical Algorithms". More information about
1171  * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with
1172  * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
1173  * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
1174  *
1175  * Algorithm R is used for unweighted sampling without replacement. The heap-based
1176  * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
1177  *
1178  * The classic algorithm stops after identifying the selected set of items. This
1179  * implementation goes one step further and randomizes the order of the selected
1180  * lines. This is consistent with shuffling (line order randomization), a primary
1181  * tsv-sample use-case.
1182  *
1183  * This algorithm is faster than reservoirSamplingViaHeap when the sample size
1184  * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
1185  * Insertion in this algorithm is O(1). Similarly, generating the random order in the
1186  * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
1187  *
 * This speed advantage may be offset to some degree by the more expensive random
 * value generation. reservoirSamplingViaHeap generates values between zero and one,
 * whereas reservoirSamplingAlgorithmR generates random integers over an ever-growing
1191  * interval. The latter is expected to be more expensive. This is consistent with
1192  * performance tests indicating that reservoirSamplingViaHeap is faster when using
1193  * small-to-medium size reservoirs and large input streams.
1194  */
1195 void reservoirSamplingAlgorithmR(Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
1196     (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1197 if (isOutputRange!(OutputRange, char))
1198 {
1199     import std.meta : AliasSeq;
1200     import std.random : Random = Mt19937, randomShuffle, uniform;
1201     import std.algorithm : sort;
1202     import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;
1203 
1204     assert(cmdopt.sampleSize > 0);
1205     assert(!cmdopt.hasWeightField);
1206     assert(!cmdopt.compatibilityMode);
1207     assert(!cmdopt.printRandom);
1208     assert(!cmdopt.genRandomInorder);
1209 
1210     static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
1211     {
1212         const(char)[] line;
1213         static if (preserveInputOrder) ulong lineNumber;
1214     }
1215 
1216     Entry!preserveInputOrder[] reservoir;
1217     auto reservoirAppender = appender(&reservoir);
1218     reservoirAppender.reserve(cmdopt.sampleSize);
1219 
1220     auto randomGenerator = Random(cmdopt.seed);
1221 
1222     /* Process each line. */
1223 
1224     bool headerWritten = false;
1225     ulong totalLineNum = 0;
1226     foreach (filename; cmdopt.files)
1227     {
1228         auto inputStream = (filename == "-") ? stdin : filename.File();
1229         foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
1230         {
1231             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
1232             if (fileLineNum == 1 && cmdopt.hasHeader)
1233             {
1234                 if (!headerWritten)
1235                 {
1236                     outputStream.put(line);
1237                     outputStream.put("\n");
1238                     headerWritten = true;
1239                 }
1240             }
1241             else
1242             {
1243                 /* Add lines to the reservoir until the reservoir is filled.
1244                  * After that lines are added with decreasing likelihood, based on
1245                  * the total number of lines seen. If added to the reservoir, the
1246                  * line replaces a randomly chosen existing line.
1247                  */
1248                 static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
1249                 else alias entryCTArgs = AliasSeq!();
1250 
1251                 if (totalLineNum < cmdopt.sampleSize)
1252                 {
1253                     reservoirAppender ~= Entry!preserveInputOrder(line.idup, entryCTArgs);
1254                 }
1255                 else
1256                 {
                    immutable size_t i = uniform(0, totalLineNum + 1, randomGenerator);
1258                     if (i < reservoir.length)
1259                     {
1260                         reservoir[i] = Entry!preserveInputOrder(line.idup, entryCTArgs);
1261                     }
1262                 }
1263 
1264                 ++totalLineNum;
1265             }
1266         }
1267     }
1268 
1269     /* Done with input. The sample is in the reservoir. Update the order and print. */
1270 
1271     static if (preserveInputOrder)
1272     {
1273         reservoir.sort!((a, b) => a.lineNumber < b.lineNumber);
1274     }
1275     else
1276     {
1277         reservoir.randomShuffle(randomGenerator);
1278     }
1279 
1280     foreach (ref entry; reservoir)
1281     {
1282         outputStream.put(entry.line);
1283         outputStream.put("\n");
1284     }
1285 }
1286 
1287 /** Shuffling command handler. Invokes the appropriate shuffle (line order
1288  * randomization) routine based on the command line arguments.
1289  *
1290  * Shuffling has similarities to random sampling, but the algorithms used are
1291  * different. Random sampling selects a subset, only the current subset selection
1292  * needs to be kept in memory. This is supported by reservoir sampling. By contrast,
1293  * shuffling needs to hold all input in memory, so it works better to read all lines
1294  * into memory at once and then shuffle.
1295  *
1296  * Two different algorithms are used. Array shuffling is used for unweighted shuffling.
1297  * Sorting plus random weight assignments is used for weighted shuffling and when
1298  * compatibility mode is being used.
1299  *
1300  * The algorithms used here are all limited by available memory.
1301  */
1302 void shuffleCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1303 if (isOutputRange!(OutputRange, char))
1304 {
1305     if (cmdopt.hasWeightField)
1306     {
1307         randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
1308     }
1309     else if (cmdopt.compatibilityMode)
1310     {
1311         randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
1312     }
1313     else
1314     {
1315         randomizeLinesViaShuffle(cmdopt, outputStream);
1316     }
1317 }
1318 
1319 /** Shuffle all input lines by assigning random weights and sorting.
1320  *
1321  * randomizeLinesViaSort reads in all input lines and writes them out in random order.
1322  * The algorithm works by assigning a random value to each line and sorting. Both
1323  * weighted and unweighted shuffling are supported.
1324  *
1325  * Notes:
1326  * $(LIST
1327  *   * For unweighted shuffling randomizeLinesViaShuffle is faster and should be used
1328  *     unless compatibility mode is needed.
1329  *   * This routine is significantly faster than heap-based reservoir sampling in the
1330  *     case where the entire file is being read.
1331  *   * Input data must be read entirely in memory. Disk oriented techniques are needed
1332  *     when data sizes get too large for available memory. One option is to generate
1333  *     random values for each line, e.g. --gen-random-inorder, and sort with a disk-
1334  *     backed sort program like GNU sort.
1335  * )
1336  */
1337 void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)
1338     (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1339 if (isOutputRange!(OutputRange, char))
1340 {
1341     import std.algorithm : map, sort;
1342 
1343     static if (isWeighted) assert(cmdopt.hasWeightField);
1344     else assert(!cmdopt.hasWeightField);
1345 
1346     assert(cmdopt.sampleSize == 0);
1347 
1348     /*
1349      * Read all file data into memory. Then split the data into lines and assign a
1350      * random value to each line. identifyInputLines also writes the first header line.
1351      */
1352     const fileData = cmdopt.files.readFileData;
1353     auto inputLines = fileData.identifyInputLines!(Yes.hasRandomValue, isWeighted)(cmdopt, outputStream);
1354 
1355     /*
1356      * Sort by the weight and output the lines.
1357      */
1358     inputLines.sort!((a, b) => a.randomValue > b.randomValue);
1359 
1360     foreach (lineEntry; inputLines)
1361     {
1362         if (cmdopt.printRandom)
1363         {
1364             outputStream.formatRandomValue(lineEntry.randomValue);
1365             outputStream.put(cmdopt.delim);
1366         }
1367         outputStream.put(lineEntry.data);
1368         outputStream.put("\n");
1369     }
1370 }
1371 
1372 /** Shuffle (randomize) all input lines using a shuffling algorithm.
1373  *
1374  * All lines in files and/or standard input are read in and written out in random
1375  * order. This routine uses array shuffling, which is faster than sorting. It is a
1376  * good alternative to randomizeLinesViaSort when doing unweighted shuffling (the
1377  * most common case).
1378  *
1379  * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (as
 * with --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
1382  *
1383  * This routine does not support random value printing or compatibility-mode.
1384  */
1385 void randomizeLinesViaShuffle(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1386 if (isOutputRange!(OutputRange, char))
1387 {
1388     import std.algorithm : map;
1389     import std.random : Random = Mt19937, randomShuffle;
1390 
1391     assert(cmdopt.sampleSize == 0);
1392     assert(!cmdopt.hasWeightField);
1393     assert(!cmdopt.printRandom);
1394     assert(!cmdopt.genRandomInorder);
1395 
1396     /*
1397      * Read all file data into memory and split into lines.
1398      */
1399     const fileData = cmdopt.files.readFileData;
1400     auto inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);
1401 
1402     /*
1403      * Randomly shuffle and print each line.
1404      *
1405      * Note: Also tried randomCover, but that was exceedingly slow.
1406      */
1409     auto randomGenerator = Random(cmdopt.seed);
1410     inputLines.randomShuffle(randomGenerator);
1411 
1412     foreach (ref line; inputLines)
1413     {
1414         outputStream.put(line.data);
1415         outputStream.put("\n");
1416     }
1417 }
1418 
1419 /** Simple random sampling with replacement.
1420  *
1421  * All lines in files and/or standard input are read in. Then random lines are selected
1422  * one at a time and output. Lines can be selected multiple times. This process continues
1423  * until the desired number of samples (--n|num) has been output. Output continues
1424  * indefinitely if a sample size was not provided.
1425  */
1426 void simpleRandomSamplingWithReplacement(OutputRange)
1427     (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1428 if (isOutputRange!(OutputRange, char))
1429 {
1430     import std.algorithm : map;
1431     import std.random : Random = Mt19937, uniform;
1432 
1433     /*
1434      * Read all file data into memory and split the data into lines.
1435      */
1436     const fileData = cmdopt.files.readFileData;
1437     const inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);
1438 
1439     if (inputLines.length > 0)
1440     {
1441         auto randomGenerator = Random(cmdopt.seed);
1442 
        /* Repeat forever if sampleSize is zero, otherwise print sampleSize lines. */
1444         size_t numLeft = (cmdopt.sampleSize == 0) ? 1 : cmdopt.sampleSize;
1445         while (numLeft != 0)
1446         {
1447             immutable size_t index = uniform(0, inputLines.length, randomGenerator);
1448             outputStream.put(inputLines[index].data);
1449             outputStream.put("\n");
1450             if (cmdopt.sampleSize != 0) numLeft--;
1451         }
1452     }
1453 }
1454 
1455 /** A container holding data read from a file or standard input.
1456  *
1457  * The InputBlock struct is used to represent a block of data read from a file or
1458  * standard input. An array of InputBlocks is returned by readFileData. Typically one
1459  * block per file. Multiple blocks are used for standard input and when the file size
1460  * cannot be determined. Individual lines are not allowed to span blocks. The blocks
1461  * allocated to an individual file are numbered starting with zero.
1462  *
1463  * See readFileData() for more information.
1464  */
1465 static struct InputBlock
1466 {
1467     string filename;          /// Original filename or path. "-" denotes standard input.
1468     size_t fileBlockNumber;   /// Zero-based block number for the file.
1469     char[] data;              /// The actual data. Newline terminated or last block for the file.
1470 }
1471 
1472 /** Read data from one or more files. This routine is used by algorithms needing to
1473  * read all data into memory.
1474  *
1475  * readFileData reads in all data from a set of files. Data is returned as an array
1476  * of InputBlock structs. Normally one InputBlock per file, sized to match the size
1477  * of the file. Standard input is read in one or more blocks, as are files whose size
1478  * cannot be determined. Multiple blocks are used in these last two cases to avoid
1479  * expensive memory reallocations. This is not necessary when file size is known as
1480  * the necessary memory can be preallocated.
1481  *
1482  * Individual lines never span multiple blocks, and newlines are preserved. This
1483  * means that each block starts at the beginning of a line and ends with a newline
1484  * unless the end of a file has been reached. Each file gets its own block so that
1485  * header processing can be done.
1486  */
1487 InputBlock[] readFileData(const string[] files)
1488 {
1489     import std.algorithm : find, min;
1490     import std.range : retro;
1491 
1492     enum BlockSize = 1024L * 1024L * 1024L;  // 1 GB. ('L' notation avoids overflow w/ 2GB+ sizes.)
1493     enum ReadSize = 1024L * 128L;
1494     enum NewlineSearchSize = 1024L * 16L;
1495 
1496     InputBlock[] blocks;
1497     auto blocksAppender = appender(&blocks);
1498     blocksAppender.reserve(files.length);  // At least one block per file.
1499 
1500     ubyte[] rawReadBuffer = new ubyte[ReadSize];
1501 
1502     foreach (filename; files)
1503     {
1504         /* If the file size can be determined then read it as a single block.
1505          * Otherwise read as multiple blocks. File.size() returns ulong.max
1506          * if file size cannot be determined, so we'll combine that check
1507          * with the standard input case.
1508          */
1509 
1510         auto ifile = (filename == "-") ? stdin : filename.File;
1511         immutable ulong filesize = (filename == "-") ? ulong.max : ifile.size;
1512 
1513         if (filesize != ulong.max)
1514         {
1515             readFileDataAsOneBlock(filename, ifile, filesize, blocksAppender, rawReadBuffer);
1516         }
1517         else
1518         {
1519             readFileDataAsMultipleBlocks(filename, ifile, blocksAppender, rawReadBuffer,
1520                                          BlockSize, NewlineSearchSize);
1521         }
1522     }
1523     return blocks;
1524 }
1525 
1526 /* readFileData() helper function. Read data from a File handle as a single block. The
1527  * new block is appended to an existing InputBlock[] array.
1528  *
 * readFileDataAsOneBlock is part of the readFileData logic. It handles the case
1530  * where a file is being read as a single block. Normally initialBlockSize is passed
1531  * as the size of the file.
1532  *
1533  * This routine has been separated out to enable unit testing. At present it is not
1534  * intended as a general API. See readFileData for more info.
1535  */
1536 private void readFileDataAsOneBlock(
1537     string filename,
1538     ref File ifile,
1539     const ulong initialBlockSize,
1540     ref RefAppender!(InputBlock[]) blocksAppender,
1541     ref ubyte[] rawReadBuffer)
1542 {
1543     blocksAppender.put(InputBlock(filename, 0));
1544     auto dataAppender = appender(&(blocksAppender.data[$-1].data));
1545     dataAppender.reserve(initialBlockSize);
1546 
1547     foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
1548     {
1549         dataAppender.put(cast(char[]) buffer);
1550     }
1551 }
1552 
1553 /* readFileData() helper function. Read data from a File handle as one or more blocks.
1554  * Blocks are appended to an existing InputBlock[] array.
1555  *
1556  * readFileDataAsMultipleBlocks is part of the readFileData logic. It handles the case
1557  * where a file or standard input is being read as a series of blocks. This is the
1558  * standard approach for standard input, but also applies when the file size cannot be
1559  * determined.
1560  *
1561  * This routine has been separated out to enable unit testing. At present it is not
1562  * intended as a general API. See readFileData for more info.
1563  */
1564 private void readFileDataAsMultipleBlocks(
1565     string filename,
1566     ref File ifile,
1567     ref RefAppender!(InputBlock[]) blocksAppender,
1568     ref ubyte[] rawReadBuffer,
1569     const size_t blockSize,
1570     const size_t newlineSearchSize)
1571 {
1572     import std.algorithm : find, min;
1573     import std.range : retro;
1574 
1575     assert(ifile.isOpen);
1576 
1577     /* Create a new block for the file and an Appender for writing data.
1578      */
1579     blocksAppender.put(InputBlock(filename, 0));
1580     auto dataAppender = appender(&(blocksAppender.data[$-1].data));
1581     dataAppender.reserve(blockSize);
1582     size_t blockNumber = 0;
1583 
1584     /* Read all the data and copy it to an InputBlock. */
1585     foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
1586     {
1587         assert(blockNumber == blocksAppender.data[$-1].fileBlockNumber);
1588 
1589         immutable size_t remainingCapacity = dataAppender.capacity - dataAppender.data.length;
1590 
1591         if (buffer.length <= remainingCapacity)
1592         {
1593             dataAppender.put(cast(char[]) buffer);
1594         }
1595         else
1596         {
1597             /* Look for the last newline in the input buffer that fits in remaining
1598              * capacity of the block.
1599              */
1600             auto searchRegion = buffer[0 .. remainingCapacity];
1601             auto appendRegion = searchRegion.retro.find('\n').source;
1602 
1603             if (appendRegion.length > 0)
1604             {
1605                 /* Copy the first part of the read buffer to the block. */
1606                 dataAppender.put(cast(char[]) appendRegion);
1607 
1608                 /* Create a new InputBlock and copy the remaining data to it. */
1609                 blockNumber++;
1610                 blocksAppender.put(InputBlock(filename, blockNumber));
1611                 dataAppender = appender(&(blocksAppender.data[$-1].data));
1612                 dataAppender.reserve(blockSize);
1613                 dataAppender.put(cast(char[]) buffer[appendRegion.length .. $]);
1614 
1615                 assert(blocksAppender.data.length >= 2);
1616                 assert(blocksAppender.data[$-2].data[$-1] == '\n');
1617             }
1618             else
1619             {
1620                 /* Search backward in the current block for a newline. If found, it
1621                  * becomes the last newline in the current block. Anything following
                 * it is moved to a new block. If a newline is not found, simply append
1623                  * to the current block and let it grow. We'll only search backward
1624                  * so far.
1625                  */
1626                 immutable size_t currBlockLength = blocksAppender.data[$-1].data.length;
1627                 immutable size_t searchLength = min(currBlockLength, newlineSearchSize);
1628                 immutable size_t searchStart = currBlockLength - searchLength;
1629                 auto blockSearchRegion = blocksAppender.data[$-1].data[searchStart .. $];
1630                 auto lastNewlineOffset = blockSearchRegion.retro.find('\n').source.length;
1631 
1632                 if (lastNewlineOffset != 0)
1633                 {
1634                     /* Create a new InputBlock. The previous InputBlock is then found
1635                      * at blocksAppender.data[$-2]. It may be a physically different
1636                      * struct (a copy) if the blocks array gets reallocated.
1637                      */
1638                     blockNumber++;
1639                     blocksAppender.put(InputBlock(filename, blockNumber));
1640                     dataAppender = appender(&(blocksAppender.data[$-1].data));
1641                     dataAppender.reserve(blockSize);
1642 
1643                     /* Copy data following the newline from the last block to the new
1644                      * block. Then append the current read buffer.
1645                      */
1646                     immutable size_t moveRegionStart = searchStart + lastNewlineOffset;
1647                     dataAppender.put(blocksAppender.data[$-2].data[moveRegionStart .. $]);
1648                     dataAppender.put(cast(char[]) buffer);
1649 
1650                     /* Now delete the moved region from the last block. */
1651                     blocksAppender.data[$-2].data.length = moveRegionStart;
1652 
1653                     assert(blocksAppender.data.length >= 2);
1654                     assert(blocksAppender.data[$-2].data[$-1] == '\n');
1655                 }
1656                 else
1657                 {
1658                     /* Give up. Allow the current block to grow. */
1659                     dataAppender.put(cast(char[]) buffer);
1660                 }
1661             }
1662         }
1663     }
1664 }
1665 
1666 /** HasRandomValue is a boolean flag used at compile time by identifyInputLines to
1667  * distinguish use cases needing random value assignments from those that don't.
1668  */
1669 alias HasRandomValue = Flag!"hasRandomValue";
1670 
/** An InputLine array is returned by identifyInputLines to represent each non-header
 * line found in an InputBlock array. The 'data' member contains the line. A 'randomValue'
 * member is included if random values are being generated.
1674  */
1675 static struct InputLine(HasRandomValue hasRandomValue)
1676 {
1677     const(char)[] data;
1678     static if (hasRandomValue) double randomValue;
1679 }
1680 
1681 /** identifyInputLines is used by algorithms that read all files into memory prior to
1682  * processing. It does the initial processing of the file data.
1683  *
1684  * Three primary tasks are performed. One is splitting all input data into lines. The
1685  * second is writing the header line from the first file to the output stream. Header
1686  * lines from subsequent files are ignored. Third is assigning a random value to the
1687  * line, if random values are being generated.
1688  *
1689  * The key input is an InputBlock array. Normally one block for each file, but standard
1690  * input may have multiple blocks.
1691  *
1692  * The return value is an array of InputLine structs. The struct will have a 'randomValue'
1693  * member if random values are being assigned.
1694  */
1695 InputLine!hasRandomValue[] identifyInputLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted, OutputRange)
1696 (const ref InputBlock[] inputBlocks, TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1697 if (isOutputRange!(OutputRange, char))
1698 {
1699     import std.algorithm : splitter;
1700     import std.array : appender;
1701     import std.random : Random = Mt19937, uniform01;
1702     import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;
1703 
1704     static assert(hasRandomValue || !isWeighted);
1705     static if(!hasRandomValue) assert(!cmdopt.printRandom);
1706 
1707     InputLine!hasRandomValue[] inputLines;
1708 
1709     auto linesAppender = appender(&inputLines);
1710     static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed);
1711     bool headerWritten = false;
1712     size_t fileLineNum;
1713 
1714     foreach (block; inputBlocks)
1715     {
1716         /* Drop the last newline to avoid adding an extra empty line. */
1717         const data = (block.data.length > 0 && block.data[$-1] == '\n') ?
1718             block.data[0 .. $-1] : block.data;
1719 
1720         if (block.fileBlockNumber == 0) fileLineNum = 0;
1721 
1722         foreach (ref line; data.splitter('\n'))
1723         {
1724             fileLineNum++;
1725 
1726             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, block.filename, fileLineNum);
1727             if (fileLineNum == 1 && cmdopt.hasHeader)
1728             {
1729                 if (!headerWritten)
1730                 {
1731                     if (cmdopt.printRandom)
1732                     {
1733                         outputStream.put(cmdopt.randomValueHeader);
1734                         outputStream.put(cmdopt.delim);
1735                     }
1736                     outputStream.put(line);
1737                     outputStream.put("\n");
1738                     headerWritten = true;
1739                 }
1740             }
1741             else
1742             {
1743                 static if (!hasRandomValue)
1744                 {
1745                     linesAppender.put(InputLine!hasRandomValue(line));
1746                 }
1747                 else
1748                 {
1749                     static if (!isWeighted)
1750                     {
1751                         immutable double randomValue = uniform01(randomGenerator);
1752                     }
1753                     else
1754                     {
1755                         immutable double lineWeight =
1756                             getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
1757                                                  block.filename, fileLineNum);
1758                         immutable double randomValue =
1759                             (lineWeight > 0.0)
1760                             ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
1761                             : 0.0;
1762                     }
1763 
1764                     linesAppender.put(InputLine!hasRandomValue(line, randomValue));
1765                 }
1766             }
1767         }
1768     }
1769 
1770     return inputLines;
1771 }
1772 
/* Unit tests for readFileData. These tests focus on multiple InputBlock scenarios.
 * Other code paths are well covered by the test cases at the end of the module.
1775  */
1776 unittest
1777 {
1778     import tsv_utils.common.unittest_utils;
1779     import std.algorithm : equal, find, joiner, splitter;
1780     import std.array : appender;
1781     import std.file : rmdirRecurse;
1782     import std.format : format;
1783     import std.path : buildPath;
1784     import std.range : repeat;
1785 
1786     auto rfdTestDir = makeUnittestTempDir("tsv_sample_readFileData");
1787     scope(exit) rfdTestDir.rmdirRecurse;
1788 
1789     char[] file1Data;
1790     char[] file2Data;
1791     char[] file3Data;
1792 
1793     auto app1 = appender(&file1Data);
1794     auto app2 = appender(&file2Data);
1795     auto app3 = appender(&file3Data);
1796 
1797     /* File 1: 1000 short lines. */
1798     app1.put("\n".repeat(100).joiner);
1799     app1.put("x\n".repeat(100).joiner);
1800     app1.put("yz\n".repeat(100).joiner);
1801     app1.put("pqr\n".repeat(100).joiner);
1802     app1.put("a\nbc\ndef\n".repeat(100).joiner);
1803     app1.put('\n'.repeat(100));
1804     app1.put("z\n".repeat(100).joiner);
1805     app1.put("xy\n".repeat(100).joiner);
1806 
    /* File 2: 600 longer lines. */
1808     app2.put(
1809         "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
1810         .repeat(100)
1811         .joiner);
1812     app2.put(
1813         "|abcdefghijklmnopqrstuv|\n|0123456789|\n|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ|\n|abcdefghijklmnopqrstuvwxyz|\n"
1814         .repeat(100)
1815         .joiner);
1816     app2.put(
1817          "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
1818         .repeat(100)
1819         .joiner);
1820 
1821     /* File 3: 1000 mixed length lines. */
1822     app3.put("\n\n|abcde|\n1\n12\n123\n|abcdefghijklmnop|\n|xyz|\n0123456789\nX\n".repeat(100).joiner);
1823 
1824     string file1Path = buildPath(rfdTestDir, "file1.txt");
1825     string file2Path = buildPath(rfdTestDir, "file2.txt");
1826     string file3Path = buildPath(rfdTestDir, "file3.txt");
1827 
1828     try
1829     {
1830         auto ofile1 = File(file1Path, "w");
1831         ofile1.write(file1Data);
1832     }
1833     catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file1Path, e.msg));
1834 
1835     try
1836     {
1837         auto ofile2 = File(file2Path, "w");
1838         ofile2.write(file2Data);
1839     }
1840     catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file2Path, e.msg));
1841 
1842     try
1843     {
1844         auto ofile3 = File(file3Path, "w");
1845         ofile3.write(file3Data);
1846     }
1847     catch  (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file3Path, e.msg));
1848 
1849     auto allData = file1Data ~ file2Data ~ file3Data;
1850     auto expectedLines = allData.splitter('\n').array[0 .. $-1];
1851 
1852     auto file2DataNoHeader = (file2Data.find('\n'))[1 .. $];
1853     auto file3DataNoHeader = (file3Data.find('\n'))[1 .. $];
1854     auto allDataUsingHeader = file1Data ~ file2DataNoHeader ~ file3DataNoHeader;
1855     auto expectedLinesUsingHeader = allDataUsingHeader.splitter('\n').array[0 .. $-1];
1856 
1857     assert(expectedLines.length == expectedLinesUsingHeader.length + 2);
1858 
1859     TsvSampleOptions cmdoptNoHeader;
1860     auto noHeaderCmdArgs = ["unittest"];
1861     auto r1 = cmdoptNoHeader.processArgs(noHeaderCmdArgs);
1862     assert(r1[0], format("Invalid command lines arg: '%s'.", noHeaderCmdArgs));
1863 
1864     TsvSampleOptions cmdoptYesHeader;
1865     auto yesHeaderCmdArgs = ["unittest", "--header"];
1866     auto r2 = cmdoptYesHeader.processArgs(yesHeaderCmdArgs);
1867     assert(r2[0], format("Invalid command lines arg: '%s'.", yesHeaderCmdArgs));
1868 
1869     auto outputStream = appender!(char[])();
1870 
1871     {
1872         /* Reading as single blocks. */
1873         ubyte[] rawReadBuffer = new ubyte[256];
1874         InputBlock[] blocks;
1875         auto blocksAppender = appender(&blocks);
1876         blocksAppender.reserve(3);
1877         foreach (f; [ file1Path, file2Path, file3Path ])
1878         {
1879             auto ifile = f.File;
1880             ulong filesize = ifile.size;
1881             if (filesize == ulong.max) filesize = 1000;
1882             readFileDataAsOneBlock(f, ifile, filesize, blocksAppender, rawReadBuffer);
1883         }
1884         auto inputLines =
1885             identifyInputLines!(No.hasRandomValue, No.isWeighted)(
1886                 blocks, cmdoptNoHeader, outputStream);
1887 
1888         assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
1889     }
1890 
1891     {
1892         /* Reading as multiple blocks. */
1893         foreach (size_t searchSize; [ 0, 1, 2, 64 ])
1894         {
1895             foreach (size_t blockSize; [ 1, 2, 16, 64, 256 ])
1896             {
1897                 foreach (size_t readSize; [ 1, 2, 8, 32 ])
1898                 {
1899                     ubyte[] rawReadBuffer = new ubyte[readSize];
1900                     InputBlock[] blocks;
1901                     auto blocksAppender = appender(&blocks);
1902                     blocksAppender.reserve(3);
1903                     foreach (f; [ file1Path, file2Path, file3Path ])
1904                     {
1905                         auto ifile = f.File;
1906                         readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
1907                                                      rawReadBuffer, blockSize, searchSize);
1908                     }
1909                     auto inputLines =
1910                         identifyInputLines!(No.hasRandomValue, No.isWeighted)(
1911                             blocks, cmdoptNoHeader, outputStream);
1912 
1913                     assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
1914                 }
1915             }
1916         }
1917     }
1918     {
1919         /* Reading as multiple blocks, with header processing. */
1920         const size_t readSize = 32;
1921         const size_t blockSize = 48;
1922         const size_t searchSize = 16;
1923 
1924         ubyte[] rawReadBuffer = new ubyte[readSize];
1925         InputBlock[] blocks;
1926         auto blocksAppender = appender(&blocks);
1927         blocksAppender.reserve(3);
1928         foreach (f; [ file1Path, file2Path, file3Path ])
1929         {
1930             auto ifile = f.File;
1931             readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
1932                                          rawReadBuffer, blockSize, searchSize);
1933         }
1934         auto inputLines =
1935             identifyInputLines!(No.hasRandomValue, No.isWeighted)(
1936                 blocks, cmdoptYesHeader, outputStream);
1937 
1938         assert(outputStream.data == expectedLinesUsingHeader[0] ~ '\n');
1939         assert(equal!((a, b) => a.data == b)(inputLines, expectedLinesUsingHeader[1 .. $]));
1940     }
1941 }
1942 
1943 /** Write a floating point random value to an output stream.
1944  *
 * This routine is used for floating point random value printing. It writes 17
 * significant digits, the range available in doubles. It prefers decimal format,
 * without exponents, and will generate somewhat large precision numbers, currently
 * up to 28 digits, before switching to exponents.
1949  *
1950  * The primary reason for this approach is to enable faster sorting on random values
1951  * by GNU sort and similar external sorting programs. GNU sort is dramatically faster
1952  * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch).
1953  * The 'general numeric' handles exponential notation. The difference is 5-10x.
1954  *
1955  * Random values generated by Bernoulli sampling are nearly always greater than 1e-12.
 * No examples less than 1e-09 were seen in hundreds of millions of trials. Similar
1957  * results were seen with weighted sampling with integer weights. The same is not true
1958  * with floating point weights. These produce quite large exponents. However, even
 * for floating point weights this can be useful. For random weights in [0, 1], less
 * than 5% of values will be below 1e-12 and use exponential notation.
1961  */
1962 void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value)
1963 if (isOutputRange!(OutputRange, char))
1964 {
1965     import std.format : formatValue, singleSpec;
1966 
1967     immutable spec17f = singleSpec("%.17f");
1968     immutable spec18f = singleSpec("%.18f");
1969     immutable spec19f = singleSpec("%.19f");
1970     immutable spec20f = singleSpec("%.20f");
1971     immutable spec21f = singleSpec("%.21f");
1972     immutable spec22f = singleSpec("%.22f");
1973     immutable spec23f = singleSpec("%.23f");
1974     immutable spec24f = singleSpec("%.24f");
1975     immutable spec25f = singleSpec("%.25f");
1976     immutable spec26f = singleSpec("%.26f");
1977     immutable spec27f = singleSpec("%.27f");
1978     immutable spec28f = singleSpec("%.28f");
1979 
1980     immutable spec17g = singleSpec("%.17g");
1981 
1982     immutable formatSpec =
1983         (value >= 1e-01) ? spec17f :
1984         (value >= 1e-02) ? spec18f :
1985         (value >= 1e-03) ? spec19f :
1986         (value >= 1e-04) ? spec20f :
1987         (value >= 1e-05) ? spec21f :
1988         (value >= 1e-06) ? spec22f :
1989         (value >= 1e-07) ? spec23f :
1990         (value >= 1e-08) ? spec24f :
1991         (value >= 1e-09) ? spec25f :
1992         (value >= 1e-10) ? spec26f :
1993         (value >= 1e-11) ? spec27f :
1994         (value >= 1e-12) ? spec28f : spec17g;
1995 
1996     outputStream.formatValue(value, formatSpec);
1997 }
1998 
1999 @safe unittest
2000 {
2001     void testFormatValue(double value, string expected)
2002     {
2003         import std.array : appender;
2004         import std.format : format;
2005 
2006         auto s = appender!string();
2007         s.formatRandomValue(value);
2008         assert(s.data == expected,
2009                format("[testFormatValue] value: %g; expected: %s; actual: %s", value, expected, s.data));
2010     }
2011 
2012     testFormatValue(1.0,   "1.00000000000000000");
2013     testFormatValue(0.1,   "0.10000000000000001");
2014     testFormatValue(0.01,  "0.010000000000000000");
2015     testFormatValue(1e-03, "0.0010000000000000000");
2016     testFormatValue(1e-04, "0.00010000000000000000");
2017     testFormatValue(1e-05, "0.000010000000000000001");
2018     testFormatValue(1e-06, "0.0000010000000000000000");
2019     testFormatValue(1e-07, "0.00000010000000000000000");
2020     testFormatValue(1e-08, "0.000000010000000000000000");
2021     testFormatValue(1e-09, "0.0000000010000000000000001");
2022     testFormatValue(1e-10, "0.00000000010000000000000000");
2023     testFormatValue(1e-11, "0.000000000009999999999999999");
2024     testFormatValue(1e-12, "0.0000000000010000000000000000");
2025     testFormatValue(1e-13, "1e-13");
2026     testFormatValue(1e-14, "1e-14");
2027     testFormatValue(12345678901234567e-15, "12.34567890123456735");
2028     testFormatValue(12345678901234567e-16, "1.23456789012345669");
2029     testFormatValue(12345678901234567e-17, "0.12345678901234566");
2030     testFormatValue(12345678901234567e-18, "0.012345678901234567");
2031     testFormatValue(12345678901234567e-19, "0.0012345678901234567");
2032     testFormatValue(12345678901234567e-20, "0.00012345678901234567");
2033     testFormatValue(12345678901234567e-21, "0.000012345678901234568");
2034     testFormatValue(12345678901234567e-22, "0.0000012345678901234567");
2035     testFormatValue(12345678901234567e-23, "0.00000012345678901234566");
2036     testFormatValue(12345678901234567e-24, "0.000000012345678901234567");
2037     testFormatValue(12345678901234567e-25, "0.0000000012345678901234566");
2038     testFormatValue(12345678901234567e-26, "0.00000000012345678901234568");
2039     testFormatValue(12345678901234567e-27, "0.000000000012345678901234567");
2040     testFormatValue(12345678901234567e-28, "0.0000000000012345678901234567");
2041     testFormatValue(12345678901234567e-29, "1.2345678901234566e-13");
2042 }
2043 
2044 
2045 /** Convenience function for extracting a single field from a line. See
2046  * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error
2047  * text tailored for this program.
2048  */
2049 import std.traits : isSomeChar;
2050 T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, ulong lineNum) pure @safe
2051 if (isSomeChar!C)
2052 {
2053     import std.conv : ConvException, to;
2054     import std.format : format;
2055     import tsv_utils.common.utils : getTsvFieldValue;
2056 
2057     T val;
2058     try
2059     {
2060         val = getTsvFieldValue!T(line, fieldIndex, delim);
2061     }
2062     catch (ConvException exc)
2063     {
2064         throw new Exception(
2065             format("Could not process line: %s\n  File: %s Line: %s%s",
2066                    exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum,
2067                    (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : ""));
2068     }
2069     catch (Exception exc)
2070     {
2071         /* Not enough fields on the line. */
2072         throw new Exception(
2073             format("Could not process line: %s\n  File: %s Line: %s",
2074                    exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum));
2075     }
2076 
2077     return val;
2078 }
2079 
2080 @safe unittest
2081 {
    /* getFieldValue unit tests. getTsvFieldValue has its own tests.
2083      * These tests make basic sanity checks on the getFieldValue wrapper.
2084      */
2085     import std.exception;
2086 
2087     assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
2088     assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);
2089     assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 1));
2090     assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 2));
2091     assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 1));
2092     assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 2));
2093 }
2094 
2095 /* Unit tests for the main program start here.
2096  *
2097  * Portability note: Many of the tests here rely on generating consistent random numbers
2098  * across different platforms when using the same random seed. So far this has succeeded
2099  * on several different platform, compiler, and library versions. However, it is certainly
2100  * possible this condition will not hold on other platforms.
2101  *
2102  * For tsv-sample, this portability implies generating the same results on different
2103  * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
2104  * but it is convenient for testing. If platforms are identified that do not generate
2105  * the same results these tests will need to be adjusted.
2106  */
2107 version(unittest)
2108 {
2109     /* Unit test helper functions. */
2110 
2111     import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
2112     import std.conv : to;
2113 
2114     void testTsvSample(string[] cmdArgs, string[][] expected)
2115     {
2116         import std.array : appender;
2117         import std.format : format;
2118 
2119         assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");
2120 
2121         auto formatAssertMessage(T...)(string msg, T formatArgs)
2122         {
2123             auto formatString = "[testTsvSample] %s: " ~ msg;
2124             return format(formatString, cmdArgs[0], formatArgs);
2125         }
2126 
2127         TsvSampleOptions cmdopt;
2128         auto savedCmdArgs = cmdArgs.to!string;
2129         auto r = cmdopt.processArgs(cmdArgs);
2130         assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
2131         auto output = appender!(char[])();
2132 
2133         tsvSample(cmdopt, output);    // This invokes the main code line.
2134 
2135         auto expectedOutput = expected.tsvDataToString;
2136 
2137         assert(output.data == expectedOutput,
2138                formatAssertMessage(
2139                    "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
2140                    expectedOutput.to!string, output.data.to!string));
2141     }
2142  }
2143 
2144 unittest
2145 {
2146     import std.path : buildPath;
2147     import std.file : rmdirRecurse;
2148     import std.format : format;
2149 
2150     auto testDir = makeUnittestTempDir("tsv_sample");
2151     scope(exit) testDir.rmdirRecurse;
2152 
2153     /* Tabular data sets and expected results use the built-in static seed.
2154      * Tests are run by writing the data set to a file, then calling the main
2155      * routine to process. The function testTsvSample plays the role of the
2156      * main program. Rather than writing to expected output, the results are
2157      * matched against expected. The expected results were verified by hand
2158      * prior to inclusion in the test.
2159      *
2160      * The initial part of this section is simply setting up data files and
2161      * expected results.
2162      *
2163      * Expected results naming conventions:
2164      *  - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected
2165      *  - Sampling Type (required): Permute (Shuffle), Sample, Replace, Bernoulli, Distinct
2166      *  - Compatibility: Compat, AlgoR, Skip, Swap, Inorder
2167      *  - Weight Field: Wt<num>, e.g. Wt3
     *  - Sample Size: Num<num>, e.g. Num3
     *  - Seed Value: V<num>, e.g. V77
     *  - Key Field: K<num>, e.g. K2
     *  - Probability: P<num>, e.g. P05 (5%)
2172      *  - Printing Probabilities: Probs
2173      *  - Printing Probs in order: ProbsInorder
2174      *  - Printing Probs with custom header: RVCustom
2175      */
2176 
2177     /* Empty file. */
2178     string[][] dataEmpty = [];
2179     string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
2180     writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
2181 
    /* 3x0, header only. */
2183     string[][] data3x0 = [["field_a", "field_b", "field_c"]];
2184     string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
2185     writeUnittestTsvFile(fpath_data3x0, data3x0);
2186 
2187     /* 3x1 */
2188     string[][] data3x1 =
2189         [["field_a", "field_b", "field_c"],
2190          ["tan", "タン", "8.5"]];
2191 
2192     string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
2193     string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
2194     writeUnittestTsvFile(fpath_data3x1, data3x1);
2195     writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1 .. $]);
2196 
2197     string[][] data3x1ExpectedReplaceNum3 =
2198         [["field_a", "field_b", "field_c"],
2199          ["tan", "タン", "8.5"],
2200          ["tan", "タン", "8.5"],
2201          ["tan", "タン", "8.5"]];
2202 
2203     /* 3x2 */
2204     string[][] data3x2 =
2205         [["field_a", "field_b", "field_c"],
2206          ["brown", "褐色", "29.2"],
2207          ["gray", "グレー", "6.2"]];
2208 
2209     string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
2210     string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
2211     writeUnittestTsvFile(fpath_data3x2, data3x2);
2212     writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1 .. $]);
2213 
2214     string[][] data3x2PermuteCompat =
2215         [["field_a", "field_b", "field_c"],
2216          ["gray", "グレー", "6.2"],
2217          ["brown", "褐色", "29.2"]];
2218 
2219     string[][] data3x2PermuteShuffle =
2220         [["field_a", "field_b", "field_c"],
2221          ["gray", "グレー", "6.2"],
2222          ["brown", "褐色", "29.2"]];
2223 
2224     /* 3x3 */
2225     string[][] data3x3 =
2226         [["field_a", "field_b", "field_c"],
2227          ["orange", "オレンジ", "2.5"],
2228          ["pink", "ピンク", "1.1"],
2229          ["purple", "紫の", "42"]];
2230 
2231     string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
2232     string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
2233     writeUnittestTsvFile(fpath_data3x3, data3x3);
2234     writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1 .. $]);
2235 
2236     string[][] data3x3ExpectedPermuteCompat =
2237         [["field_a", "field_b", "field_c"],
2238          ["purple", "紫の", "42"],
2239          ["pink", "ピンク", "1.1"],
2240          ["orange", "オレンジ", "2.5"]];
2241 
2242     string[][] data3x3ExpectedPermuteSwap =
2243         [["field_a", "field_b", "field_c"],
2244          ["purple", "紫の", "42"],
2245          ["orange", "オレンジ", "2.5"],
2246          ["pink", "ピンク", "1.1"]];
2247 
2248     /* 3x6 */
2249     string[][] data3x6 =
2250         [["field_a", "field_b", "field_c"],
2251          ["red", "赤", "23.8"],
2252          ["green", "緑", "0.0072"],
2253          ["white", "白", "1.65"],
2254          ["yellow", "黄", "12"],
2255          ["blue", "青", "12"],
2256          ["black", "黒", "0.983"]];
2257     string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
2258     string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
2259     writeUnittestTsvFile(fpath_data3x6, data3x6);
2260     writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1 .. $]);
2261 
2262     // Randomization, all lines
2263     string[][] data3x6ExpectedPermuteCompat =
2264         [["field_a", "field_b", "field_c"],
2265          ["yellow", "黄", "12"],
2266          ["black", "黒", "0.983"],
2267          ["blue", "青", "12"],
2268          ["white", "白", "1.65"],
2269          ["green", "緑", "0.0072"],
2270          ["red", "赤", "23.8"]];
2271 
2272     string[][] data3x6ExpectedPermuteSwap =
2273         [["field_a", "field_b", "field_c"],
2274          ["black", "黒", "0.983"],
2275          ["green", "緑", "0.0072"],
2276          ["red", "赤", "23.8"],
2277          ["yellow", "黄", "12"],
2278          ["white", "白", "1.65"],
2279          ["blue", "青", "12"]];
2280 
2281     string[][] data3x6ExpectedPermuteCompatProbs =
2282         [["random_value", "field_a", "field_b", "field_c"],
2283          ["0.96055546286515892", "yellow", "黄", "12"],
2284          ["0.75710153928957880", "black", "黒", "0.983"],
2285          ["0.52525980887003243", "blue", "青", "12"],
2286          ["0.49287854949943721", "white", "白", "1.65"],
2287          ["0.15929344086907804", "green", "緑", "0.0072"],
2288          ["0.010968807619065046", "red", "赤", "23.8"]];
2289 
2290     /* Note: data3x6ExpectedSampleAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because
2291      * both are effectively the same algorithm given that --num is data length. Both read
2292      * in the full data in order then call randomShuffle.
2293      */
2294     string[][] data3x6ExpectedSampleAlgoRNum6 =
2295         [["field_a", "field_b", "field_c"],
2296          ["black", "黒", "0.983"],
2297          ["green", "緑", "0.0072"],
2298          ["red", "赤", "23.8"],
2299          ["yellow", "黄", "12"],
2300          ["white", "白", "1.65"],
2301          ["blue", "青", "12"]];
2302 
2303     string[][] data3x6ExpectedSampleAlgoRNum5 =
2304         [["field_a", "field_b", "field_c"],
2305          ["red", "赤", "23.8"],
2306          ["black", "黒", "0.983"],
2307          ["white", "白", "1.65"],
2308          ["green", "緑", "0.0072"],
2309          ["yellow", "黄", "12"]];
2310 
2311     string[][] data3x6ExpectedSampleAlgoRNum4 =
2312         [["field_a", "field_b", "field_c"],
2313          ["blue", "青", "12"],
2314          ["green", "緑", "0.0072"],
2315          ["black", "黒", "0.983"],
2316          ["white", "白", "1.65"]];
2317 
2318     string[][] data3x6ExpectedSampleAlgoRNum3 =
2319         [["field_a", "field_b", "field_c"],
2320          ["red", "赤", "23.8"],
2321          ["black", "黒", "0.983"],
2322          ["green", "緑", "0.0072"]];
2323 
2324     string[][] data3x6ExpectedSampleAlgoRNum2 =
2325         [["field_a", "field_b", "field_c"],
2326          ["black", "黒", "0.983"],
2327          ["red", "赤", "23.8"]];
2328 
2329     string[][] data3x6ExpectedSampleAlgoRNum1 =
2330         [["field_a", "field_b", "field_c"],
2331          ["green", "緑", "0.0072"]];
2332 
2333     /* Inorder versions. */
2334     string[][] data3x6ExpectedSampleAlgoRNum6Inorder =
2335         [["field_a", "field_b", "field_c"],
2336          ["red", "赤", "23.8"],
2337          ["green", "緑", "0.0072"],
2338          ["white", "白", "1.65"],
2339          ["yellow", "黄", "12"],
2340          ["blue", "青", "12"],
2341          ["black", "黒", "0.983"]];
2342 
2343     string[][] data3x6ExpectedSampleAlgoRNum5Inorder =
2344         [["field_a", "field_b", "field_c"],
2345          ["red", "赤", "23.8"],
2346          ["green", "緑", "0.0072"],
2347          ["white", "白", "1.65"],
2348          ["yellow", "黄", "12"],
2349          ["black", "黒", "0.983"]];
2350 
2351     string[][] data3x6ExpectedSampleAlgoRNum4Inorder =
2352         [["field_a", "field_b", "field_c"],
2353          ["green", "緑", "0.0072"],
2354          ["white", "白", "1.65"],
2355          ["blue", "青", "12"],
2356          ["black", "黒", "0.983"]];
2357 
2358     string[][] data3x6ExpectedSampleAlgoRNum3Inorder =
2359         [["field_a", "field_b", "field_c"],
2360          ["red", "赤", "23.8"],
2361          ["green", "緑", "0.0072"],
2362          ["black", "黒", "0.983"]];
2363 
2364     string[][] data3x6ExpectedSampleAlgoRNum2Inorder =
2365         [["field_a", "field_b", "field_c"],
2366          ["red", "赤", "23.8"],
2367          ["black", "黒", "0.983"]];
2368 
2369     string[][] data3x6ExpectedSampleAlgoRNum1Inorder =
2370         [["field_a", "field_b", "field_c"],
2371          ["green", "緑", "0.0072"]];
2372 
2373     /* Reservoir inorder */
2374     string[][] data3x6ExpectedSampleCompatNum6Inorder =
2375         [["field_a", "field_b", "field_c"],
2376          ["red", "赤", "23.8"],
2377          ["green", "緑", "0.0072"],
2378          ["white", "白", "1.65"],
2379          ["yellow", "黄", "12"],
2380          ["blue", "青", "12"],
2381          ["black", "黒", "0.983"]];
2382 
2383     string[][] data3x6ExpectedSampleCompatNum5Inorder =
2384         [["field_a", "field_b", "field_c"],
2385          ["green", "緑", "0.0072"],
2386          ["white", "白", "1.65"],
2387          ["yellow", "黄", "12"],
2388          ["blue", "青", "12"],
2389          ["black", "黒", "0.983"]];
2390 
2391     string[][] data3x6ExpectedSampleCompatNum4Inorder =
2392         [["field_a", "field_b", "field_c"],
2393          ["white", "白", "1.65"],
2394          ["yellow", "黄", "12"],
2395          ["blue", "青", "12"],
2396          ["black", "黒", "0.983"]];
2397 
2398     string[][] data3x6ExpectedSampleCompatNum3Inorder =
2399         [["field_a", "field_b", "field_c"],
2400          ["yellow", "黄", "12"],
2401          ["blue", "青", "12"],
2402          ["black", "黒", "0.983"]];
2403 
2404     string[][] data3x6ExpectedSampleCompatNum2Inorder =
2405         [["field_a", "field_b", "field_c"],
2406          ["yellow", "黄", "12"],
2407          ["black", "黒", "0.983"]];
2408 
2409     string[][] data3x6ExpectedSampleCompatNum1Inorder =
2410         [["field_a", "field_b", "field_c"],
2411          ["yellow", "黄", "12"]];
2412 
2413 
2414     /* Reservoir inorder with probabilities. */
2415     string[][] data3x6ExpectedSampleCompatNum6ProbsInorder =
2416         [["random_value", "field_a", "field_b", "field_c"],
2417          ["0.010968807619065046", "red", "赤", "23.8"],
2418          ["0.15929344086907804", "green", "緑", "0.0072"],
2419          ["0.49287854949943721", "white", "白", "1.65"],
2420          ["0.96055546286515892", "yellow", "黄", "12"],
2421          ["0.52525980887003243", "blue", "青", "12"],
2422          ["0.75710153928957880", "black", "黒", "0.983"]];
2423 
2424     string[][] data3x6ExpectedSampleCompatNum5ProbsInorder =
2425         [["random_value", "field_a", "field_b", "field_c"],
2426          ["0.15929344086907804", "green", "緑", "0.0072"],
2427          ["0.49287854949943721", "white", "白", "1.65"],
2428          ["0.96055546286515892", "yellow", "黄", "12"],
2429          ["0.52525980887003243", "blue", "青", "12"],
2430          ["0.75710153928957880", "black", "黒", "0.983"]];
2431 
2432     string[][] data3x6ExpectedSampleCompatNum4ProbsInorder =
2433         [["random_value", "field_a", "field_b", "field_c"],
2434          ["0.49287854949943721", "white", "白", "1.65"],
2435          ["0.96055546286515892", "yellow", "黄", "12"],
2436          ["0.52525980887003243", "blue", "青", "12"],
2437          ["0.75710153928957880", "black", "黒", "0.983"]];
2438 
2439     string[][] data3x6ExpectedSampleCompatNum3ProbsInorder =
2440         [["random_value", "field_a", "field_b", "field_c"],
2441          ["0.96055546286515892", "yellow", "黄", "12"],
2442          ["0.52525980887003243", "blue", "青", "12"],
2443          ["0.75710153928957880", "black", "黒", "0.983"]];
2444 
2445     string[][] data3x6ExpectedSampleCompatNum2ProbsInorder =
2446         [["random_value", "field_a", "field_b", "field_c"],
2447          ["0.96055546286515892", "yellow", "黄", "12"],
2448          ["0.75710153928957880", "black", "黒", "0.983"]];
2449 
2450     string[][] data3x6ExpectedSampleCompatNum1ProbsInorder =
2451         [["random_value", "field_a", "field_b", "field_c"],
2452          ["0.96055546286515892", "yellow", "黄", "12"]];
2453 
2454     string[][] data3x6ExpectedWt3Num6Inorder =
2455         [["field_a", "field_b", "field_c"],
2456          ["red", "赤", "23.8"],
2457          ["green", "緑", "0.0072"],
2458          ["white", "白", "1.65"],
2459          ["yellow", "黄", "12"],
2460          ["blue", "青", "12"],
2461          ["black", "黒", "0.983"]];
2462 
2463     string[][] data3x6ExpectedWt3Num5Inorder =
2464         [["field_a", "field_b", "field_c"],
2465          ["green", "緑", "0.0072"],
2466          ["white", "白", "1.65"],
2467          ["yellow", "黄", "12"],
2468          ["blue", "青", "12"],
2469          ["black", "黒", "0.983"]];
2470 
2471     string[][] data3x6ExpectedWt3Num4Inorder =
2472         [["field_a", "field_b", "field_c"],
2473          ["white", "白", "1.65"],
2474          ["yellow", "黄", "12"],
2475          ["blue", "青", "12"],
2476          ["black", "黒", "0.983"]];
2477 
2478     string[][] data3x6ExpectedWt3Num3Inorder =
2479         [["field_a", "field_b", "field_c"],
2480          ["yellow", "黄", "12"],
2481          ["blue", "青", "12"],
2482          ["black", "黒", "0.983"]];
2483 
2484     string[][] data3x6ExpectedWt3Num2Inorder =
2485         [["field_a", "field_b", "field_c"],
2486          ["yellow", "黄", "12"],
2487          ["black", "黒", "0.983"]];
2488 
2489     string[][] data3x6ExpectedWt3Num1Inorder =
2490         [["field_a", "field_b", "field_c"],
2491          ["yellow", "黄", "12"]];
2492 
2493 
2494     string[][] data3x6ExpectedBernoulliProbsP100 =
2495         [["random_value", "field_a", "field_b", "field_c"],
2496          ["0.010968807619065046", "red", "赤", "23.8"],
2497          ["0.15929344086907804", "green", "緑", "0.0072"],
2498          ["0.49287854949943721", "white", "白", "1.65"],
2499          ["0.96055546286515892", "yellow", "黄", "12"],
2500          ["0.52525980887003243", "blue", "青", "12"],
2501          ["0.75710153928957880", "black", "黒", "0.983"]];
2502 
2503     string[][] data3x6ExpectedBernoulliCompatProbsP60 =
2504         [["random_value", "field_a", "field_b", "field_c"],
2505          ["0.010968807619065046", "red", "赤", "23.8"],
2506          ["0.15929344086907804", "green", "緑", "0.0072"],
2507          ["0.49287854949943721", "white", "白", "1.65"],
2508          ["0.52525980887003243", "blue", "青", "12"]];
2509 
2510     string[][] data3x6ExpectedBernoulliSkipP40 =
2511         [["field_a", "field_b", "field_c"],
2512          ["red", "赤", "23.8"],
2513          ["green", "緑", "0.0072"],
2514          ["yellow", "黄", "12"]];
2515 
2516     string[][] data3x6ExpectedBernoulliCompatP60 =
2517         [["field_a", "field_b", "field_c"],
2518          ["red", "赤", "23.8"],
2519          ["green", "緑", "0.0072"],
2520          ["white", "白", "1.65"],
2521          ["blue", "青", "12"]];
2522 
2523     string[][] data3x6ExpectedDistinctK1K3P60 =
2524         [["field_a", "field_b", "field_c"],
2525          ["green", "緑", "0.0072"],
2526          ["white", "白", "1.65"],
2527          ["blue", "青", "12"]];
2528 
2529     string[][] data3x6ExpectedDistinctK1K3P60Probs =
2530         [["random_value", "field_a", "field_b", "field_c"],
2531          ["0", "green", "緑", "0.0072"],
2532          ["0", "white", "白", "1.65"],
2533          ["0", "blue", "青", "12"]];
2534 
2535     string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom =
2536         [["custom_random_value_header", "field_a", "field_b", "field_c"],
2537          ["0", "green", "緑", "0.0072"],
2538          ["0", "white", "白", "1.65"],
2539          ["0", "blue", "青", "12"]];
2540 
2541     string[][] data3x6ExpectedDistinctK2P2ProbsInorder =
2542         [["random_value", "field_a", "field_b", "field_c"],
2543          ["1", "red", "赤", "23.8"],
2544          ["0", "green", "緑", "0.0072"],
2545          ["0", "white", "白", "1.65"],
2546          ["1", "yellow", "黄", "12"],
2547          ["3", "blue", "青", "12"],
2548          ["2", "black", "黒", "0.983"]];
2549 
2550     string[][] data3x6ExpectedPermuteWt3Probs =
2551         [["random_value", "field_a", "field_b", "field_c"],
2552          ["0.99665198757645390", "yellow", "黄", "12"],
2553          ["0.94775884809836686", "blue", "青", "12"],
2554          ["0.82728234682286661", "red", "赤", "23.8"],
2555          ["0.75346697377181959", "black", "黒", "0.983"],
2556          ["0.65130103496422487", "white", "白", "1.65"],
2557          ["1.5636943712879866e-111", "green", "緑", "0.0072"]];
2558 
2559     string[][] data3x6ExpectedWt3ProbsInorder =
2560         [["random_value", "field_a", "field_b", "field_c"],
2561          ["0.82728234682286661", "red", "赤", "23.8"],
2562          ["1.5636943712879866e-111", "green", "緑", "0.0072"],
2563          ["0.65130103496422487", "white", "白", "1.65"],
2564          ["0.99665198757645390", "yellow", "黄", "12"],
2565          ["0.94775884809836686", "blue", "青", "12"],
2566          ["0.75346697377181959", "black", "黒", "0.983"]];
2567 
2568     string[][] data3x6ExpectedPermuteWt3 =
2569         [["field_a", "field_b", "field_c"],
2570          ["yellow", "黄", "12"],
2571          ["blue", "青", "12"],
2572          ["red", "赤", "23.8"],
2573          ["black", "黒", "0.983"],
2574          ["white", "白", "1.65"],
2575          ["green", "緑", "0.0072"]];
2576 
2577 
2578     string[][] data3x6ExpectedReplaceNum10 =
2579         [["field_a", "field_b", "field_c"],
2580          ["black", "黒", "0.983"],
2581          ["green", "緑", "0.0072"],
2582          ["green", "緑", "0.0072"],
2583          ["red", "赤", "23.8"],
2584          ["yellow", "黄", "12"],
2585          ["red", "赤", "23.8"],
2586          ["white", "白", "1.65"],
2587          ["yellow", "黄", "12"],
2588          ["yellow", "黄", "12"],
2589          ["white", "白", "1.65"],
2590         ];
2591 
2592     string[][] data3x6ExpectedReplaceNum10V77 =
2593         [["field_a", "field_b", "field_c"],
2594          ["black", "黒", "0.983"],
2595          ["red", "赤", "23.8"],
2596          ["black", "黒", "0.983"],
2597          ["yellow", "黄", "12"],
2598          ["green", "緑", "0.0072"],
2599          ["green", "緑", "0.0072"],
2600          ["green", "緑", "0.0072"],
2601          ["yellow", "黄", "12"],
2602          ["blue", "青", "12"],
2603          ["white", "白", "1.65"],
2604         ];
2605 
2606     /* Using a different static seed. */
2607     string[][] data3x6ExpectedPermuteCompatV41Probs =
2608         [["random_value", "field_a", "field_b", "field_c"],
2609          ["0.68057272653095424", "green", "緑", "0.0072"],
2610          ["0.67681624367833138", "blue", "青", "12"],
2611          ["0.32097338931635022", "yellow", "黄", "12"],
2612          ["0.25092361867427826", "red", "赤", "23.8"],
2613          ["0.15535934292711318", "black", "黒", "0.983"],
2614          ["0.046095821075141430", "white", "白", "1.65"]];
2615 
2616     string[][] data3x6ExpectedBernoulliCompatP60V41Probs =
2617         [["random_value", "field_a", "field_b", "field_c"],
2618          ["0.25092361867427826", "red", "赤", "23.8"],
2619          ["0.046095821075141430", "white", "白", "1.65"],
2620          ["0.32097338931635022", "yellow", "黄", "12"],
2621          ["0.15535934292711318", "black", "黒", "0.983"]];
2622 
2623     string[][] data3x6ExpectedPermuteWt3V41Probs =
2624         [["random_value", "field_a", "field_b", "field_c"],
2625          ["0.96799377498910666", "blue", "青", "12"],
2626          ["0.94356245792573568", "red", "赤", "23.8"],
2627          ["0.90964601024271996", "yellow", "黄", "12"],
2628          ["0.15491658409260103", "white", "白", "1.65"],
2629          ["0.15043620392537033", "black", "黒", "0.983"],
2630          ["6.1394674830701461e-24", "green", "緑", "0.0072"]];
2631 
2632     string[][] data3x6ExpectedWt3V41ProbsInorder =
2633         [["random_value", "field_a", "field_b", "field_c"],
2634          ["0.94356245792573568", "red", "赤", "23.8"],
2635          ["6.1394674830701461e-24", "green", "緑", "0.0072"],
2636          ["0.15491658409260103", "white", "白", "1.65"],
2637          ["0.90964601024271996", "yellow", "黄", "12"],
2638          ["0.96799377498910666", "blue", "青", "12"],
2639          ["0.15043620392537033", "black", "黒", "0.983"]];
2640 
2641 
2642     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2643     string[][] combo1ExpectedPermuteCompat =
2644         [["field_a", "field_b", "field_c"],
2645          ["yellow", "黄", "12"],
2646          ["tan", "タン", "8.5"],
2647          ["brown", "褐色", "29.2"],
2648          ["green", "緑", "0.0072"],
2649          ["red", "赤", "23.8"],
2650          ["purple", "紫の", "42"],
2651          ["black", "黒", "0.983"],
2652          ["white", "白", "1.65"],
2653          ["gray", "グレー", "6.2"],
2654          ["blue", "青", "12"],
2655          ["pink", "ピンク", "1.1"],
2656          ["orange", "オレンジ", "2.5"]];
2657 
2658     string[][] combo1ExpectedPermuteCompatProbs =
2659         [["random_value", "field_a", "field_b", "field_c"],
2660          ["0.97088520275428891", "yellow", "黄", "12"],
2661          ["0.96055546286515892", "tan", "タン", "8.5"],
2662          ["0.81756894313730299", "brown", "褐色", "29.2"],
2663          ["0.75710153928957880", "green", "緑", "0.0072"],
2664          ["0.52525980887003243", "red", "赤", "23.8"],
2665          ["0.49287854949943721", "purple", "紫の", "42"],
2666          ["0.47081507067196071", "black", "黒", "0.983"],
2667          ["0.38388182921335101", "white", "白", "1.65"],
2668          ["0.29215990612283349", "gray", "グレー", "6.2"],
2669          ["0.24033216014504433", "blue", "青", "12"],
2670          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2671          ["0.010968807619065046", "orange", "オレンジ", "2.5"]];
2672 
2674     string[][] combo1ExpectedProbsInorder =
2675         [["random_value", "field_a", "field_b", "field_c"],
2676          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2677          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2678          ["0.49287854949943721", "purple", "紫の", "42"],
2679          ["0.96055546286515892", "tan", "タン", "8.5"],
2680          ["0.52525980887003243", "red", "赤", "23.8"],
2681          ["0.75710153928957880", "green", "緑", "0.0072"],
2682          ["0.38388182921335101", "white", "白", "1.65"],
2683          ["0.97088520275428891", "yellow", "黄", "12"],
2684          ["0.24033216014504433", "blue", "青", "12"],
2685          ["0.47081507067196071", "black", "黒", "0.983"],
2686          ["0.81756894313730299", "brown", "褐色", "29.2"],
2687          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2688 
2689     string[][] combo1ExpectedBernoulliCompatP50Probs =
2690         [["random_value", "field_a", "field_b", "field_c"],
2691          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2692          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2693          ["0.49287854949943721", "purple", "紫の", "42"],
2694          ["0.38388182921335101", "white", "白", "1.65"],
2695          ["0.24033216014504433", "blue", "青", "12"],
2696          ["0.47081507067196071", "black", "黒", "0.983"],
2697          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2698 
2699     string[][] combo1ExpectedBernoulliCompatP40 =
2700         [["field_a", "field_b", "field_c"],
2701          ["orange", "オレンジ", "2.5"],
2702          ["pink", "ピンク", "1.1"],
2703          ["white", "白", "1.65"],
2704          ["blue", "青", "12"],
2705          ["gray", "グレー", "6.2"]];
2706 
2707     string[][] combo1ExpectedDistinctK1P40 =
2708         [["field_a", "field_b", "field_c"],
2709          ["orange", "オレンジ", "2.5"],
2710          ["red", "赤", "23.8"],
2711          ["green", "緑", "0.0072"],
2712          ["blue", "青", "12"],
2713          ["black", "黒", "0.983"]];
2714 
2715     string[][] combo1ExpectedPermuteWt3Probs =
2716         [["random_value", "field_a", "field_b", "field_c"],
2717          ["0.99754077523718754", "yellow", "黄", "12"],
2718          ["0.99527665440088786", "tan", "タン", "8.5"],
2719          ["0.99312578945741659", "brown", "褐色", "29.2"],
2720          ["0.98329602553389361", "purple", "紫の", "42"],
2721          ["0.97330961938083660", "red", "赤", "23.8"],
2722          ["0.88797551521739648", "blue", "青", "12"],
2723          ["0.81999230489041786", "gray", "グレー", "6.2"],
2724          ["0.55975569204250941", "white", "白", "1.65"],
2725          ["0.46472135609205739", "black", "黒", "0.983"],
2726          ["0.18824582704191337", "pink", "ピンク", "1.1"],
2727          ["0.16446131853299920", "orange", "オレンジ", "2.5"],
2728          ["1.6438086931020549e-17", "green", "緑", "0.0072"]];
2729 
2730     string[][] combo1ExpectedPermuteWt3 =
2731         [["field_a", "field_b", "field_c"],
2732          ["yellow", "黄", "12"],
2733          ["tan", "タン", "8.5"],
2734          ["brown", "褐色", "29.2"],
2735          ["purple", "紫の", "42"],
2736          ["red", "赤", "23.8"],
2737          ["blue", "青", "12"],
2738          ["gray", "グレー", "6.2"],
2739          ["white", "白", "1.65"],
2740          ["black", "黒", "0.983"],
2741          ["pink", "ピンク", "1.1"],
2742          ["orange", "オレンジ", "2.5"],
2743          ["green", "緑", "0.0072"]];
2744 
    string[][] combo1ExpectedSampleAlgoRNum4 =
2746         [["field_a", "field_b", "field_c"],
2747          ["blue", "青", "12"],
2748          ["gray", "グレー", "6.2"],
2749          ["brown", "褐色", "29.2"],
2750          ["white", "白", "1.65"]];
2751 
    string[][] combo1ExpectedSampleAlgoRNum4Inorder =
2753         [["field_a", "field_b", "field_c"],
2754          ["white", "白", "1.65"],
2755          ["blue", "青", "12"],
2756          ["brown", "褐色", "29.2"],
2757          ["gray", "グレー", "6.2"]];
2758 
2759     string[][] combo1ExpectedReplaceNum10 =
2760         [["field_a", "field_b", "field_c"],
2761          ["gray", "グレー", "6.2"],
2762          ["yellow", "黄", "12"],
2763          ["yellow", "黄", "12"],
2764          ["white", "白", "1.65"],
2765          ["tan", "タン", "8.5"],
2766          ["white", "白", "1.65"],
2767          ["blue", "青", "12"],
2768          ["black", "黒", "0.983"],
2769          ["tan", "タン", "8.5"],
2770          ["purple", "紫の", "42"]];
2771 
    /* 1x200 - Needed for testing bernoulliSkipSampling, which is used when prob < 0.04.
     * An illustrative sketch of the skip sampling idea follows the file writes below.
     */
2773     string[][] data1x200 =
2774         [["field_a"],
2775          ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"],
2776          ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"],
2777          ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"],
2778          ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"],
2779          ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"],
2780          ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"],
2781          ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"],
2782          ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"],
2783          ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"],
2784          ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"],
2785          ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"],
2786          ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"],
2787          ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"],
2788          ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"],
2789          ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"],
2790          ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"],
2791          ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"],
2792          ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"],
2793          ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"],
2794          ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"],
2795         ];
2796 
2797     string fpath_data1x200 = buildPath(testDir, "data1x200.tsv");
2798     string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv");
2799     writeUnittestTsvFile(fpath_data1x200, data1x200);
2800     writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1 .. $]);
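
    /* Illustrative sketch of the skip sampling technique exercised by the data1x200
     * tests: instead of drawing a random value for every line, draw geometrically
     * distributed gaps between selected lines. This is a minimal sketch of the idea
     * under an assumed gap formula, seed, and naming, not tsv-sample's
     * bernoulliSkipSampling implementation.
     */
    {
        import std.math : floor, log;
        import std.random : Mt19937, uniform;
        auto skipSketchRng = Mt19937(333);
        enum double skipSketchProb = 0.03;
        /* Gap to the next selected line: floor(log(u) / log(1 - p)), u uniform in (0, 1). */
        long skipSketchGap()
        {
            return cast(long) floor(log(uniform!"()"(0.0, 1.0, skipSketchRng)) / log(1.0 - skipSketchProb));
        }
        long[] skipSketchSelected;
        for (long skipSketchLine = skipSketchGap();
             skipSketchLine < 200;
             skipSketchLine += 1 + skipSketchGap())
        {
            skipSketchSelected ~= skipSketchLine;
        }
        /* With p = 0.03, roughly six of the 200 lines are selected; the exact set depends on the RNG stream. */
        assert(skipSketchSelected.length < 200);
    }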
2801 
2802     string[][] data1x200ExpectedBernoulliSkipV333P01 =
2803         [["field_a"],
2804          ["077"],
2805          ["119"]];
2806 
2807     string[][] data1x200ExpectedBernoulliSkipV333P02 =
2808         [["field_a"],
2809          ["038"],
2810          ["059"],
2811          ["124"],
2812          ["161"],
2813          ["162"],
2814          ["183"]];
2815 
2816     string[][] data1x200ExpectedBernoulliSkipV333P03 =
2817         [["field_a"],
2818          ["025"],
2819          ["039"],
2820          ["082"],
2821          ["107"],
2822          ["108"],
2823          ["122"],
2824          ["136"],
2825          ["166"],
2826          ["182"]];
2827 
2828     string[][] data1x200ExpectedBernoulliCompatV333P01 =
2829         [["field_a"],
2830          ["072"]];
2831 
2832     string[][] data1x200ExpectedBernoulliCompatV333P02 =
2833         [["field_a"],
2834          ["004"],
2835          ["072"]];
2836 
2837     string[][] data1x200ExpectedBernoulliCompatV333P03 =
2838         [["field_a"],
2839          ["004"],
2840          ["072"],
2841          ["181"]];
2842 
    /* Combo 2, for Bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files,
     * only expected results. The header is from 3x0. The results are offset by one
     * position from data1x200ExpectedBernoulliSkipV333P03 because the single data line
     * from 3x1 precedes the 1x200 data.
     */
2847     string[][] combo2ExpectedBernoulliSkipV333P03 =
2848         [["field_a", "field_b", "field_c"],
2849          ["024"],
2850          ["038"],
2851          ["081"],
2852          ["106"],
2853          ["107"],
2854          ["121"],
2855          ["135"],
2856          ["165"],
2857          ["181"]];
2858 
2859 
2860     /* 1x10 - Simple 1-column file. */
2861     string[][] data1x10 =
2862         [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
2863     string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
2864     string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
2865     writeUnittestTsvFile(fpath_data1x10, data1x10);
2866     writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1 .. $]);
2867 
2868     string[][] data1x10ExpectedPermuteCompat =
2869         [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
2870 
2871     string[][] data1x10ExpectedPermuteWt1 =
2872         [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]];
2873 
2874     /* 2x10a - Uniform distribution [0,1]. */
2875     string[][] data2x10a =
2876         [["line", "weight"],
2877          ["1", "0.26788837"],
2878          ["2", "0.06601298"],
2879          ["3", "0.38627527"],
2880          ["4", "0.47379424"],
2881          ["5", "0.02966641"],
2882          ["6", "0.05636231"],
2883          ["7", "0.70529242"],
2884          ["8", "0.91836862"],
2885          ["9", "0.99103720"],
2886          ["10", "0.31401740"]];
2887 
2888     string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv");
2889     writeUnittestTsvFile(fpath_data2x10a, data2x10a);
2890 
2891     string[][] data2x10aExpectedPermuteWt2Probs =
2892         [["random_value", "line", "weight"],
2893          ["0.96833865494543658", "8", "0.91836862"],
2894          ["0.91856842054413923", "4", "0.47379424"],
2895          ["0.25730832087795091", "7", "0.70529242"],
2896          ["0.23725317907018120", "9", "0.99103720"],
2897          ["0.16016096701872204", "3", "0.38627527"],
2898          ["0.090819662667243381", "10", "0.31401740"],
2899          ["0.0071764539244361172", "6", "0.05636231"],
2900          ["0.000000048318642951630057", "1", "0.26788837"],
2901          ["0.00000000037525692966535517", "5", "0.02966641"],
2902          ["8.2123247880095796e-13", "2", "0.06601298"]];
2903 
2904     /* 2x10b - Uniform distribution [0,1000]. */
2905     string[][] data2x10b =
2906         [["line", "weight"],
2907          ["1", "761"],
2908          ["2", "432"],
2909          ["3", "103"],
2910          ["4", "448"],
2911          ["5", "750"],
2912          ["6", "711"],
2913          ["7", "867"],
2914          ["8", "841"],
2915          ["9", "963"],
2916          ["10", "784"]];
2917 
2918     string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv");
2919     writeUnittestTsvFile(fpath_data2x10b, data2x10b);
2920 
2921     string[][] data2x10bExpectedPermuteWt2Probs =
2922         [["random_value", "line", "weight"],
2923          ["0.99996486739067969", "8", "841"],
2924          ["0.99991017467137211", "4", "448"],
2925          ["0.99960871524873662", "6", "711"],
2926          ["0.99914188537143800", "5", "750"],
2927          ["0.99903963250274785", "10", "784"],
2928          ["0.99889631825931946", "7", "867"],
2929          ["0.99852058315191139", "9", "963"],
2930          ["0.99575669679158918", "2", "432"],
2931          ["0.99408758732050595", "1", "761"],
2932          ["0.99315467761212362", "3", "103"]];
2933 
2934     /* 2x10c - Logarithmic distribution in random order. */
2935     string[][] data2x10c =
2936         [["line", "weight"],
2937          ["1", "31.85"],
2938          ["2", "17403.31"],
2939          ["3", "653.84"],
2940          ["4", "8.23"],
2941          ["5", "2671.04"],
2942          ["6", "26226.08"],
2943          ["7", "1.79"],
2944          ["8", "354.56"],
2945          ["9", "35213.81"],
2946          ["10", "679.29"]];
2947 
2948     string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv");
2949     writeUnittestTsvFile(fpath_data2x10c, data2x10c);
2950 
2951     string[][] data2x10cExpectedPermuteWt2Probs =
2952         [["random_value", "line", "weight"],
2953          ["0.99998939008709697", "6", "26226.08"],
2954          ["0.99995951291695517", "9", "35213.81"],
2955          ["0.99991666907613541", "8", "354.56"],
2956          ["0.99989445052186410", "2", "17403.31"],
2957          ["0.99975897602861630", "5", "2671.04"],
2958          ["0.99891852769877643", "3", "653.84"],
2959          ["0.99889167752782515", "10", "679.29"],
2960          ["0.99512207506850148", "4", "8.23"],
2961          ["0.86789371584259023", "1", "31.85"],
2962          ["0.58574438162915610", "7", "1.79"]];
2963 
    /* 2x10d - Logarithmic distribution in ascending order. */
2965     string[][] data2x10d =
2966         [["line", "weight"],
2967          ["1", "1.79"],
2968          ["2", "8.23"],
2969          ["3", "31.85"],
2970          ["4", "354.56"],
2971          ["5", "653.84"],
2972          ["6", "679.29"],
2973          ["7", "2671.04"],
2974          ["8", "17403.31"],
2975          ["9", "26226.08"],
2976          ["10", "35213.81"]];
2977 
2978     string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv");
2979     writeUnittestTsvFile(fpath_data2x10d, data2x10d);
2980 
2981     string[][] data2x10dExpectedPermuteWt2Probs =
2982         [["random_value", "line", "weight"],
2983          ["0.99999830221846353", "8", "17403.31"],
2984          ["0.99997860834041397", "10", "35213.81"],
2985          ["0.99994563828986716", "9", "26226.08"],
2986          ["0.99988650363575737", "4", "354.56"],
2987          ["0.99964161939190088", "7", "2671.04"],
2988          ["0.99959045338948649", "6", "679.29"],
2989          ["0.99901574490639788", "5", "653.84"],
2990          ["0.97803163304747431", "3", "31.85"],
2991          ["0.79994791806910948", "2", "8.23"],
2992          ["0.080374261239949119", "1", "1.79"]];
2993 
    /* 2x10e - Logarithmic distribution in descending order. */
2995     string[][] data2x10e =
2996         [["line", "weight"],
2997          ["1", "35213.81"],
2998          ["2", "26226.08"],
2999          ["3", "17403.31"],
3000          ["4", "2671.04"],
3001          ["5", "679.29"],
3002          ["6", "653.84"],
3003          ["7", "354.56"],
3004          ["8", "31.85"],
3005          ["9", "8.23"],
3006          ["10", "1.79"]];
3007     string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
3008     writeUnittestTsvFile(fpath_data2x10e, data2x10e);
3009 
3010     string[][] data2x10eExpectedPermuteWt2Probs =
3011         [["random_value", "line", "weight"],
3012          ["0.99998493348975237", "4", "2671.04"],
3013          ["0.99995934807202624", "3", "17403.31"],
3014          ["0.99992995739727453", "2", "26226.08"],
3015          ["0.99987185679245649", "1", "35213.81"],
3016          ["0.99957451563173938", "6", "653.84"],
3017          ["0.99907273650209583", "8", "31.85"],
3018          ["0.99905260312968946", "5", "679.29"],
3019          ["0.99730333650516401", "7", "354.56"],
3020          ["0.84093902435227808", "9", "8.23"],
3021          ["0.65650015926290028", "10", "1.79"]];
3022 
3023     /* Data sets for distinct sampling. */
3024     string[][] data5x25 =
3025         [["ID", "Shape", "Color", "Size", "Weight"],
3026          ["01", "circle", "red", "S", "10"],
3027          ["02", "circle", "black", "L", "20"],
3028          ["03", "square", "black", "L", "20"],
3029          ["04", "circle", "green", "L", "30"],
3030          ["05", "ellipse", "red", "S", "20"],
3031          ["06", "triangle", "red", "S", "10"],
3032          ["07", "triangle", "red", "L", "20"],
3033          ["08", "square", "black", "S", "10"],
3034          ["09", "circle", "black", "S", "20"],
3035          ["10", "square", "green", "L", "20"],
3036          ["11", "triangle", "red", "L", "20"],
3037          ["12", "circle", "green", "L", "30"],
3038          ["13", "ellipse", "red", "S", "20"],
3039          ["14", "circle", "green", "L", "30"],
3040          ["15", "ellipse", "red", "L", "30"],
3041          ["16", "square", "red", "S", "10"],
3042          ["17", "circle", "black", "L", "20"],
3043          ["18", "square", "red", "S", "20"],
3044          ["19", "square", "black", "L", "20"],
3045          ["20", "circle", "red", "S", "10"],
3046          ["21", "ellipse", "black", "L", "30"],
3047          ["22", "triangle", "red", "L", "30"],
3048          ["23", "circle", "green", "S", "20"],
3049          ["24", "square", "green", "L", "20"],
3050          ["25", "circle", "red", "S", "10"],
3051         ];
3052 
3053     string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
3054     string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
3055     writeUnittestTsvFile(fpath_data5x25, data5x25);
3056     writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1 .. $]);
3057 
3058     string[][] data5x25ExpectedDistinctK2P40 =
3059         [["ID", "Shape", "Color", "Size", "Weight"],
3060          ["03", "square", "black", "L", "20"],
3061          ["05", "ellipse", "red", "S", "20"],
3062          ["08", "square", "black", "S", "10"],
3063          ["10", "square", "green", "L", "20"],
3064          ["13", "ellipse", "red", "S", "20"],
3065          ["15", "ellipse", "red", "L", "30"],
3066          ["16", "square", "red", "S", "10"],
3067          ["18", "square", "red", "S", "20"],
3068          ["19", "square", "black", "L", "20"],
3069          ["21", "ellipse", "black", "L", "30"],
3070          ["24", "square", "green", "L", "20"],
3071         ];
3072 
3073     string[][] data5x25ExpectedDistinctK2K4P20 =
3074         [["ID", "Shape", "Color", "Size", "Weight"],
3075          ["03", "square", "black", "L", "20"],
3076          ["07", "triangle", "red", "L", "20"],
3077          ["08", "square", "black", "S", "10"],
3078          ["10", "square", "green", "L", "20"],
3079          ["11", "triangle", "red", "L", "20"],
3080          ["16", "square", "red", "S", "10"],
3081          ["18", "square", "red", "S", "20"],
3082          ["19", "square", "black", "L", "20"],
3083          ["22", "triangle", "red", "L", "30"],
3084          ["24", "square", "green", "L", "20"],
3085         ];
3086 
3087     string[][] data5x25ExpectedDistinctK2K3K4P20 =
3088         [["ID", "Shape", "Color", "Size", "Weight"],
3089          ["04", "circle", "green", "L", "30"],
3090          ["07", "triangle", "red", "L", "20"],
3091          ["09", "circle", "black", "S", "20"],
3092          ["11", "triangle", "red", "L", "20"],
3093          ["12", "circle", "green", "L", "30"],
3094          ["14", "circle", "green", "L", "30"],
3095          ["16", "square", "red", "S", "10"],
3096          ["18", "square", "red", "S", "20"],
3097          ["22", "triangle", "red", "L", "30"],
3098         ];
3099 
    /* Fields 2 and 4 from data5x25. Distinct sampling should select the same rows for equivalent keys. */
3101     string[][] data2x25 =
3102         [["Shape", "Size"],
3103          ["circle", "S"],
3104          ["circle", "L"],
3105          ["square", "L"],
3106          ["circle", "L"],
3107          ["ellipse", "S"],
3108          ["triangle", "S"],
3109          ["triangle", "L"],
3110          ["square", "S"],
3111          ["circle", "S"],
3112          ["square", "L"],
3113          ["triangle", "L"],
3114          ["circle", "L"],
3115          ["ellipse", "S"],
3116          ["circle", "L"],
3117          ["ellipse", "L"],
3118          ["square", "S"],
3119          ["circle", "L"],
3120          ["square", "S"],
3121          ["square", "L"],
3122          ["circle", "S"],
3123          ["ellipse", "L"],
3124          ["triangle", "L"],
3125          ["circle", "S"],
3126          ["square", "L"],
3127          ["circle", "S"],
3128         ];
3129 
3130     string fpath_data2x25 = buildPath(testDir, "data2x25.tsv");
3131     string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv");
3132     writeUnittestTsvFile(fpath_data2x25, data2x25);
3133     writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1 .. $]);
3134 
3135     string[][] data2x25ExpectedDistinctK1K2P20 =
3136         [["Shape", "Size"],
3137          ["square", "L"],
3138          ["triangle", "L"],
3139          ["square", "S"],
3140          ["square", "L"],
3141          ["triangle", "L"],
3142          ["square", "S"],
3143          ["square", "S"],
3144          ["square", "L"],
3145          ["triangle", "L"],
3146          ["square", "L"],
3147         ];
3148 
3149     string[][] data1x25 =
3150         [["Shape-Size"],
3151          ["circle-S"],
3152          ["circle-L"],
3153          ["square-L"],
3154          ["circle-L"],
3155          ["ellipse-S"],
3156          ["triangle-S"],
3157          ["triangle-L"],
3158          ["square-S"],
3159          ["circle-S"],
3160          ["square-L"],
3161          ["triangle-L"],
3162          ["circle-L"],
3163          ["ellipse-S"],
3164          ["circle-L"],
3165          ["ellipse-L"],
3166          ["square-S"],
3167          ["circle-L"],
3168          ["square-S"],
3169          ["square-L"],
3170          ["circle-S"],
3171          ["ellipse-L"],
3172          ["triangle-L"],
3173          ["circle-S"],
3174          ["square-L"],
3175          ["circle-S"],
3176         ];
3177 
3178     string fpath_data1x25 = buildPath(testDir, "data1x25.tsv");
3179     string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv");
3180     writeUnittestTsvFile(fpath_data1x25, data1x25);
3181     writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1 .. $]);
3182 
3183     string[][] data1x25ExpectedDistinctK1P20 =
3184         [["Shape-Size"],
3185          ["triangle-L"],
3186          ["square-S"],
3187          ["triangle-L"],
3188          ["ellipse-L"],
3189          ["square-S"],
3190          ["square-S"],
3191          ["ellipse-L"],
3192          ["triangle-L"],
3193         ];
3194 
3195     string[][] data1x25ExpectedDistinctK1P20Probs =
3196         [["random_value", "Shape-Size"],
3197          ["0", "triangle-L"],
3198          ["0", "square-S"],
3199          ["0", "triangle-L"],
3200          ["0", "ellipse-L"],
3201          ["0", "square-S"],
3202          ["0", "square-S"],
3203          ["0", "ellipse-L"],
3204          ["0", "triangle-L"],
3205         ];
3206 
3207     string[][] data1x25ExpectedDistinctK1P20ProbsInorder =
3208         [["random_value", "Shape-Size"],
3209          ["1", "circle-S"],
3210          ["4", "circle-L"],
3211          ["2", "square-L"],
3212          ["4", "circle-L"],
3213          ["2", "ellipse-S"],
3214          ["1", "triangle-S"],
3215          ["0", "triangle-L"],
3216          ["0", "square-S"],
3217          ["1", "circle-S"],
3218          ["2", "square-L"],
3219          ["0", "triangle-L"],
3220          ["4", "circle-L"],
3221          ["2", "ellipse-S"],
3222          ["4", "circle-L"],
3223          ["0", "ellipse-L"],
3224          ["0", "square-S"],
3225          ["4", "circle-L"],
3226          ["0", "square-S"],
3227          ["2", "square-L"],
3228          ["1", "circle-S"],
3229          ["0", "ellipse-L"],
3230          ["0", "triangle-L"],
3231          ["1", "circle-S"],
3232          ["2", "square-L"],
3233          ["1", "circle-S"],
3234         ];
3235 
3236     /*
3237      * Enough setup! Actually run some tests!
3238      */
3239 
3240     /* Shuffling tests. Headers, static seed, compatibility mode. With weights and without. */
3241     testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty);
3242     testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0);
3243     testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1);
3244     testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat);
3245     testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat);
3246     testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat);
3247     testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
3248     testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
3249     testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3250     testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3251     testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3252     testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
3253     testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
3254 
    /* Shuffling, without compatibility mode, or with both compatibility mode and random value printing. */
3256     testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
3257     testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0);
3258     testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1);
3259     testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle);
3260     testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap);
3261     testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap);
3262     testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
3263     testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3264     testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3265 
    /* Reservoir sampling using Algorithm R.
     * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.)
     * A minimal sketch of Algorithm R follows these test cases.
     */
3269     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
3270     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
3271     testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0);
3272     testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0);
3273     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1);
3274     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1);
3275     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6);
3276     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6);
3277     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5);
3278     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4);
3279     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3);
3280     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2);
3281     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1);
3282 
    /* Inorder versions of the Algorithm R tests. A sketch of the inorder output step follows these tests. */
3284     testTsvSample(["test-ai10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
3285     testTsvSample(["test-ai11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
3286     testTsvSample(["test-ai12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
3287     testTsvSample(["test-ai13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0);
3288     testTsvSample(["test-ai14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1);
3289     testTsvSample(["test-ai15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1);
3290     testTsvSample(["test-ai16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder);
3291     testTsvSample(["test-ai17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder);
3292     testTsvSample(["test-ai18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5Inorder);
3293     testTsvSample(["test-ai19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4Inorder);
3294     testTsvSample(["test-ai20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3Inorder);
3295     testTsvSample(["test-ai21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2Inorder);
3296     testTsvSample(["test-ai22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1Inorder);
3297 
    /* Bernoulli sampling cases. A sketch of the per-line keep/drop decision follows these tests. */
3299     testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty);
3300     testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0);
3301     testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1);
3302     testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6);
3303     testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6);
3304     testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3305     testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60);
3306     testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60);
3307     testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs);
3308 
3309     /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. */
3310     testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01);
3311     testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02);
3312     testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03);
3313     testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01);
3314     testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02);
3315     testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03);
3316     testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40);
3317 
    /* Distinct sampling cases. A sketch of the key-based selection idea follows these tests. */
3319     testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
3320     testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
3321     testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1);
3322     testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6);
3323     testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
3324 
3325 
    /* Generating random weights. Use the Bernoulli sampling test set at prob 100% for
     * the uniform case. For weighted generation, reuse the weighted cases, but with the
     * expected results in the original input order. A sketch of the weighted keying
     * follows these tests.
     */
3329     testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3330     testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3331     testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
3332                   data3x6ExpectedWt3ProbsInorder);
3333     testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
3334                   data3x6ExpectedWt3V41ProbsInorder);
3335     testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6],
3336                   data3x6ExpectedDistinctK1K3P60Probs);
3337     testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header",
3338                    "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom);
3339     testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
3340                   data3x6ExpectedDistinctK2P2ProbsInorder);
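
    /* The weighted random values above are consistent with the u^(1/weight) keying
     * commonly used for weighted random sampling (Efraimidis-Spirakis): each line's
     * key is a uniform draw raised to the power 1/weight, so heavier lines tend toward
     * keys near 1.0. A minimal sketch under that assumption; the seed and names are
     * illustrative, not tsv-sample's code.
     */
    {
        import std.math : pow;
        import std.random : Mt19937, uniform01;
        auto wtSketchRng = Mt19937(12345);
        double wtSketchKey(double weight) { return pow(uniform01(wtSketchRng), 1.0 / weight); }
        immutable wtSketchHeavy = wtSketchKey(23.8);
        immutable wtSketchLight = wtSketchKey(0.0072);
        assert(wtSketchHeavy >= 0.0 && wtSketchHeavy <= 1.0);
        assert(wtSketchLight >= 0.0 && wtSketchLight <= 1.0);
    }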
3341 
    /* Simple random sampling with replacement. A sketch of the selection loop follows these tests. */
3343     testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty);
3344     testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty);
3345     testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0);
3346     testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0);
3347     testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3);
3348     testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10);
3349     testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77);
3350 
3351     /* Shuffling, compatibility mode, without headers. */
3352     testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1 .. $]);
3353     testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1 .. $]);
3354     testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1 .. $]);
3355     testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1 .. $]);
3356     testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1 .. $]);
3357     testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]);
3358     testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]);
3359     testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]);
3360     testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. $]);
3361 
    /* Shuffling, no headers, without compatibility mode, or with both compatibility mode and random value printing. */
3363     testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1 .. $]);
3364     testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1 .. $]);
3365     testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1 .. $]);
3366     testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1 .. $]);
3367     testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]);
3368     testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]);
3369     testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]);
3370 
3371     /* Reservoir sampling using Algorithm R, no headers. */
3372     testTsvSample(["test-ac10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
3373     testTsvSample(["test-ac11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
3374     testTsvSample(["test-ac14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1 .. $]);
3375     testTsvSample(["test-ac15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1 .. $]);
3376     testTsvSample(["test-ac16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]);
3377     testTsvSample(["test-ac17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]);
3378     testTsvSample(["test-ac18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5[1 .. $]);
3379     testTsvSample(["test-ac19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4[1 .. $]);
3380     testTsvSample(["test-ac20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3[1 .. $]);
3381     testTsvSample(["test-ac21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2[1 .. $]);
3382     testTsvSample(["test-ac22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]);
3383 
3384     /* Reservoir sampling using Algorithm R, no headers, inorder output. */
3385     testTsvSample(["test-aj10", "--prefer-algorithm-r", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
3386     testTsvSample(["test-aj11", "--prefer-algorithm-r", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
3387     testTsvSample(["test-aj14", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3388     testTsvSample(["test-aj15", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3389     testTsvSample(["test-aj16", "--prefer-algorithm-r", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]);
3390     testTsvSample(["test-aj17", "--prefer-algorithm-r", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]);
3391     testTsvSample(["test-aj18", "--prefer-algorithm-r", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5Inorder[1 .. $]);
3392     testTsvSample(["test-aj19", "--prefer-algorithm-r", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4Inorder[1 .. $]);
3393     testTsvSample(["test-aj20", "--prefer-algorithm-r", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3Inorder[1 .. $]);
3394     testTsvSample(["test-aj21", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2Inorder[1 .. $]);
3395     testTsvSample(["test-aj22", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]);
3396 
3397     /* Bernoulli sampling cases. */
3398     testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1 .. $]);
3399     testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]);
3400     testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]);
3401     testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
3402     testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1 .. $]);
3403     testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. $]);
3404 
3405     /* Bernoulli sampling with probabilities in skip sampling range. */
3406     testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1 .. $]);
3407     testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1 .. $]);
3408     testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1 .. $]);
3409     testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1 .. $]);
3410     testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1 .. $]);
3411     testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1 .. $]);
3412     testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. $]);
3413 
3414     /* Distinct sampling cases. */
3415     testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1 .. $]);
3416     testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3417     testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3418     testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3419 
3420     /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */
3421     testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
3422     testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1 .. $]);
3423     testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
3424                   data3x6ExpectedDistinctK1K3P60Probs[1 .. $]);
3425     testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
3426                   data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]);
3427 
3428     /* Simple random sampling with replacement. */
3429     testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
3430     testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
3431     testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1 .. $]);
3432     testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1 .. $]);
3433     testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]);
3434 
3435     /* Multi-file tests. */
3436     testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode",
3437                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3438                   combo1ExpectedPermuteCompat);
3439     testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
3440                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3441                   combo1ExpectedPermuteCompatProbs);
3442     testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
3443                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3444                   combo1ExpectedPermuteWt3Probs);
3445     testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode",
3446                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3447                   combo1ExpectedPermuteWt3);
3448     testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4",
3449                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3450                   combo1ExpectedSampleAlgoRNum4);
3451     testTsvSample(["test-c5b", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder",
3452                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3453                   combo1ExpectedSampleAlgoRNum4Inorder);
3454 
3455     /* Multi-file, no headers. */
3456     testTsvSample(["test-c6", "--static-seed", "--compatibility-mode",
3457                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3458                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3459                   combo1ExpectedPermuteCompat[1 .. $]);
3460     testTsvSample(["test-c7", "--static-seed", "--print-random",
3461                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3462                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3463                   combo1ExpectedPermuteCompatProbs[1 .. $]);
3464     testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3",
3465                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3466                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3467                   combo1ExpectedPermuteWt3Probs[1 .. $]);
3468     testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode",
3469                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3470                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3471                   combo1ExpectedPermuteWt3[1 .. $]);
3472     testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4",
3473                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3474                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3475                   combo1ExpectedSampleAlgoRNum4[1 .. $]);
3476     testTsvSample(["test-c10b", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder",
3477                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3478                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3479                   combo1ExpectedSampleAlgoRNum4Inorder[1 .. $]);
3480 
3481     /* Bernoulli sampling cases. */
3482     testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5",
3483                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3484                   combo1ExpectedBernoulliCompatP50Probs);
3485     testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4",
3486                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3487                   combo1ExpectedBernoulliCompatP40);
3488     testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5",
3489                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3490                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3491                   combo1ExpectedBernoulliCompatP50Probs[1 .. $]);
3492     testTsvSample(["test-c14", "--static-seed", "--prob", ".4",
3493                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3494                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3495                   combo1ExpectedBernoulliCompatP40[1 .. $]);
3496 
3497     /* Bernoulli sampling with probabilities in skip sampling range. */
3498     testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03",
3499                    fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10],
3500                   combo2ExpectedBernoulliSkipV333P03);
3501     testTsvSample(["test-cc1", "-v", "333", "-p", "0.03",
3502                    fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader],
3503                   combo2ExpectedBernoulliSkipV333P03[1 .. $]);
3504 
3505     /* Distinct sampling cases. */
3506     testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
3507                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3508                   combo1ExpectedDistinctK1P40);
3509     testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4",
3510                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3511                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3512                   combo1ExpectedDistinctK1P40[1 .. $]);
3513 
3514     /* Generating random weights. */
3515     testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder",
3516                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3517                   combo1ExpectedProbsInorder);
3518     testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder",
3519                    fpath_data3x3_noheader, fpath_data3x1_noheader,
3520                    fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
3521                   combo1ExpectedProbsInorder[1 .. $]);
3522 
3523     /* Simple random sampling with replacement. */
3524     testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10",
3525                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3526                   combo1ExpectedReplaceNum10);
3527 
3528     testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10",
3529                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3530                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3531                   combo1ExpectedReplaceNum10[1 .. $]);
3532 
3533     /* Single column file. */
3534     testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
3535     testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
3536 
3537     /* Distributions. */
3538     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
3539     testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs);
3540     testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs);
3541     testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs);
3542     testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs);
3543 
    /* Tests of the subset sample (--n|num) option with random sampling, Bernoulli
     * sampling, and distinct sampling.
     *
     * Note: These tests are structured so that the subset size does not affect output
     * order. Each --num value is run against the same fixed-seed command and the result
     * is compared against a prefix of the full expected output.
     */
3549     import std.algorithm : min;
3550     for (size_t n = data3x6.length + 2; n >= 1; n--)
3551     {
3552         /* reservoirSamplingViaHeap.
3553          */
3554         size_t expectedLength = min(data3x6.length, n + 1);
3555         testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
3556                        "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
3557 
3558         testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
3559                        "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
3560 
3561         testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
3562                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]);
3563 
3564         testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
3565                        "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]);
3566 
3567         testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
3568                        "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]);
3569 
3570         testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
3571                        fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]);
3572 
3573         testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
3574                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]);
3575 
3576         testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
3577                        "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]);
3578 
3579         testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string,
3580                        "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]);
3581 
3582         /* Bernoulli sampling.
3583          */
        size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length);
3586 
3587         testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3588                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]);
3589 
3590         testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3591                        "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]);
3592 
3593         testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3594                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]);
3595 
3596         testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3597                        fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]);
3598 
3599         /* Distinct Sampling.
3600          */
3601         size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length);
3602 
3603         testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
3604                        "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]);
3605 
3606         testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
3607                        fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]);
3608 
3609         testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
3610                        "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]);
3611 
3612         testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
3613                        fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
3614     }
3615 
3616     /* Similar tests with the 1x10 data set. */
3617     for (size_t n = data1x10.length + 2; n >= 1; n--)
3618     {
3619         size_t expectedLength = min(data1x10.length, n + 1);
3620         testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
3621                        "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);
3622 
3623         testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
3624                        "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);
3625 
3626         testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
3627                        fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);
3628 
3629         testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
3630                        "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
3631     }
3632 
3633     /* Simple random sampling with replacement: ensure sample size doesn't change order. */
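    /* With a static seed the first n draws should not depend on --num, so each run is
     * compared against a prefix of the --num 10 expected output. */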
3634     for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
3635     {
3636         testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
3637                       data3x6ExpectedReplaceNum10[0 .. n + 1]);
3638 
3639         testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
3640                       data3x6ExpectedReplaceNum10[1 .. n + 1]);
3641     }
3642 
3643     /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */
3644     for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
3645     {
3646         size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);
3647 
3648         testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
3649                        "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);
3650 
3651         testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
3652                        fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
3653     }
3654 
3655     /* Inorder sampling tests using reservoir sampling via heap (compatibility mode). */
3656     testTsvSample(["test-ar10", "--compatibility-mode", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
3657     testTsvSample(["test-ar11", "--compatibility-mode", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
3658     testTsvSample(["test-ar12", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
3659     testTsvSample(["test-ar13", "--compatibility-mode", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0);
3660     testTsvSample(["test-ar14", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1);
3661     testTsvSample(["test-ar15", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1);
3662     testTsvSample(["test-ar16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
3663     testTsvSample(["test-ar17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
3664     testTsvSample(["test-ar18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum5Inorder);
3665     testTsvSample(["test-ar19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", fpath_data3x6],         data3x6ExpectedSampleCompatNum4Inorder);
3666     testTsvSample(["test-ar20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum3Inorder);
3667     testTsvSample(["test-ar21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum2Inorder);
3668     testTsvSample(["test-ar22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum1Inorder);
3669 
3670     testTsvSample(["test-as10", "--compatibility-mode", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
3671     testTsvSample(["test-as11", "--compatibility-mode", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
3672     testTsvSample(["test-as14", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3673     testTsvSample(["test-as15", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3674     testTsvSample(["test-as16", "--compatibility-mode", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
3675     testTsvSample(["test-as17", "--compatibility-mode", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
3676     testTsvSample(["test-as18", "--compatibility-mode", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5Inorder[1 .. $]);
3677     testTsvSample(["test-as19", "--compatibility-mode", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4Inorder[1 .. $]);
3678     testTsvSample(["test-as20", "--compatibility-mode", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3Inorder[1 .. $]);
3679     testTsvSample(["test-as21", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2Inorder[1 .. $]);
3680     testTsvSample(["test-as22", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1Inorder[1 .. $]);
3681 
    /* Inorder sampling tests with random value printing. --print-random implies
     * compatibility mode, so --compatibility-mode is not needed; the repeated
     * test-at19/test-at20 (and test-au19) runs without the flag verify identical output. */
3683     testTsvSample(["test-at16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
3684     testTsvSample(["test-at17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
3685     testTsvSample(["test-at18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum5ProbsInorder);
3686     testTsvSample(["test-at19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
3687     testTsvSample(["test-at19",                         "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
3688     testTsvSample(["test-at20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
3689     testTsvSample(["test-at20",                         "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
3690     testTsvSample(["test-at21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum2ProbsInorder);
3691     testTsvSample(["test-at22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum1ProbsInorder);
3692 
3693     testTsvSample(["test-au16", "--compatibility-mode", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
3694     testTsvSample(["test-au17", "--compatibility-mode", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
3695     testTsvSample(["test-au18", "--compatibility-mode", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5ProbsInorder[1 .. $]);
3696     testTsvSample(["test-au19", "--compatibility-mode", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
3697     testTsvSample(["test-au19",                         "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
3698     testTsvSample(["test-au20", "--compatibility-mode", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3ProbsInorder[1 .. $]);
3699     testTsvSample(["test-au21", "--compatibility-mode", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2ProbsInorder[1 .. $]);
3700     testTsvSample(["test-au22", "--compatibility-mode", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1ProbsInorder[1 .. $]);
3701 
3702     /* Inorder weighted sampling tests. */
3703     testTsvSample(["test-ax16", "-H", "-s", "-n", "7", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
3704     testTsvSample(["test-ax17", "-H", "-s", "-n", "6", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
3705     testTsvSample(["test-ax18", "-H", "-s", "-n", "5", "-i", fpath_data3x6], data3x6ExpectedWt3Num5Inorder);
3706     testTsvSample(["test-ax19", "-H", "-s", "-n", "4", "-i", fpath_data3x6], data3x6ExpectedWt3Num4Inorder);
3707     testTsvSample(["test-ax20", "-H", "-s", "-n", "3", "-i", fpath_data3x6], data3x6ExpectedWt3Num3Inorder);
3708     testTsvSample(["test-ax21", "-H", "-s", "-n", "2", "-i", fpath_data3x6], data3x6ExpectedWt3Num2Inorder);
3709     testTsvSample(["test-ax22", "-H", "-s", "-n", "1", "-i", fpath_data3x6], data3x6ExpectedWt3Num1Inorder);
3710 
3711     testTsvSample(["test-ay16", "-s", "-n", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
3712     testTsvSample(["test-ay17", "-s", "-n", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
3713     testTsvSample(["test-ay18", "-s", "-n", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num5Inorder[1 .. $]);
3714     testTsvSample(["test-ay19", "-s", "-n", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num4Inorder[1 .. $]);
3715     testTsvSample(["test-ay20", "-s", "-n", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num3Inorder[1 .. $]);
3716     testTsvSample(["test-ay21", "-s", "-n", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num2Inorder[1 .. $]);
3717     testTsvSample(["test-ay22", "-s", "-n", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num1Inorder[1 .. $]);
3718 
3719     /*
3720      * Distinct sampling tests.
3721      */
3722     testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
3723                   data5x25ExpectedDistinctK2P40);
3724 
3725     testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
3726                   data5x25ExpectedDistinctK2K4P20);
3727 
3728     testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
3729                   data5x25ExpectedDistinctK2K3K4P20);
3730 
3731     testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
3732                   data5x25ExpectedDistinctK2P40[1 .. $]);
3733 
3734     testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
3735                   data5x25ExpectedDistinctK2K4P20[1 .. $]);
3736 
3737     testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
3738                   data5x25ExpectedDistinctK2K3K4P20[1 .. $]);
3739 
3740 
    /* These distinct sampling tests check that using the whole line as the key ('-k 0')
     * gives the same result as specifying all fields in order. They also check that the
     * field numbers themselves don't matter: '-k 1,2' in data2x25 selects the same key
     * values as '-k 2,4' in data5x25.
     */
3745     testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25],
3746                   data2x25ExpectedDistinctK1K2P20);
3747 
3748     testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25],
3749                   data2x25ExpectedDistinctK1K2P20);
3750 
3751     testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader],
3752                   data2x25ExpectedDistinctK1K2P20[1 .. $]);
3753 
3754     testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader],
3755                   data2x25ExpectedDistinctK1K2P20[1 .. $]);
3756 
3757     /* Similar to the last set, but for a 1-column file. Also with random value printing. */
3758     testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25],
3759                   data1x25ExpectedDistinctK1P20);
3760 
3761     testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25],
3762                   data1x25ExpectedDistinctK1P20);
3763 
3764     testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader],
3765                   data1x25ExpectedDistinctK1P20[1 .. $]);
3766 
3767     testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader],
3768                   data1x25ExpectedDistinctK1P20[1 .. $]);
3769 
3770 
3771     testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25],
3772                   data1x25ExpectedDistinctK1P20Probs);
3773 
3774     testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25],
3775                   data1x25ExpectedDistinctK1P20Probs);
3776 
3777     testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader],
3778                   data1x25ExpectedDistinctK1P20Probs[1 .. $]);
3779 
3780     testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader],
3781                   data1x25ExpectedDistinctK1P20Probs[1 .. $]);
3782 
3783 
3784     testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25],
3785                   data1x25ExpectedDistinctK1P20ProbsInorder);
3786 
3787     testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25],
3788                   data1x25ExpectedDistinctK1P20ProbsInorder);
3789 
3790     testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader],
3791                   data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);
3792 
3793     testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader],
3794                   data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);
3795 
3796 }