tsv_utils.tsv_sample source code

1 /**
2 Command line tool for shuffling or sampling lines from input streams. Several methods
3 are available, including weighted and unweighted shuffling, simple and weighted random
4 sampling, sampling with replacement, Bernoulli sampling, and distinct sampling.
5 
6 Copyright (c) 2017-2020, eBay Inc.
7 Initially written by Jon Degenhardt
8 
9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 module tsv_utils.tsv_sample;
12 
13 import std.array : appender, Appender, RefAppender;
14 import std.exception : enforce;
15 import std.format : format;
16 import std.range;
17 import std.stdio;
18 import std.typecons : tuple, Flag;
19 
20 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
21 
22 version(unittest)
23 {
24     // When running unit tests, use main from -main compiler switch.
25 }
26 else
27 {
28     /** Main program.
29      *
30      * Invokes command line argument processing and calls tsvSample to do the real
31      * work. Errors occurring during processing are caught and reported to the user.
32      */
33     int main(string[] cmdArgs)
34     {
35         /* When running in DMD code coverage mode, turn on report merging. */
36         version(D_Coverage) version(DigitalMars)
37         {
38             import core.runtime : dmd_coverSetMerge;
39             dmd_coverSetMerge(true);
40         }
41 
42         TsvSampleOptions cmdopt;
43         const r = cmdopt.processArgs(cmdArgs);
44         if (!r[0]) return r[1];
45         version(LDC_Profile)
46         {
47             import ldc.profile : resetAll;
48             resetAll();
49         }
50         try
51         {
52             import tsv_utils.common.utils : BufferedOutputRange;
53             auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
54 
55             tsvSample(cmdopt, bufferedOutput);
56         }
57         catch (Exception exc)
58         {
59             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
60             return 1;
61         }
62         return 0;
63     }
64 }
65 
/** Short help text.
 *
 * Passed to defaultGetoptPrinter by TsvSampleOptions.processArgs when getopt reports
 * that help was requested. The text ends with an "Options:" heading because the
 * printer is also given the option list produced by getopt.
 */
immutable helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.

Use '--help-verbose' for detailed information.

Options:
EOS";
97 
/** Detailed help text.
 *
 * Passed to defaultGetoptPrinter by TsvSampleOptions.processArgs when the
 * '--help-verbose' option is given. The text ends with an "Options:" heading because
 * the printer is also given the option list produced by getopt.
 *
 * Option names mentioned in the text must match the names registered with getopt in
 * processArgs. In particular, '--print-random' has no short form; 'p' is the short
 * form of '--p|prob'.
 */
immutable helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option controls the sample size for all
sampling methods. In the case of simple and weighted random sampling it
also limits the amount of memory required.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, there is no memory accumulation. These algorithms
can run on arbitrary size inputs. Sampling with replacement reads all
lines into memory and is limited by available memory. Shuffling also reads
all lines into memory and is similarly limited. Random sampling uses
reservoir sampling, and only needs to hold the sample size (--n|num) in
memory. The input data can be of any length.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";
193 
/** Container for command line options and derived data.
 *
 * TsvSampleOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, TsvSampleOptions is used as a container
 * holding the specific processing options used by the different sampling routines.
 */
struct TsvSampleOptions
{
    import tsv_utils.common.utils : InputSourceRange;

    string programName;                        /// Program name
    InputSourceRange inputSources;             /// Input files
    bool helpVerbose = false;                  /// --help-verbose
    bool hasHeader = false;                    /// --H|header
    ulong sampleSize = 0;                      /// --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
    size_t[] keyFields;                        /// --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    /// --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           /// --r|replace
    bool preserveInputOrder = false;           /// --i|inorder
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    bool printRandom = false;                  /// --print-random
    bool genRandomInorder = false;             /// --gen-random-inorder
    string randomValueHeader = "random_value"; /// --random-value-header
    bool compatibilityMode = false;            /// --compatibility-mode
    char delim = '\t';                         /// --d|delimiter
    bool versionWanted = false;                /// --V|version
    bool preferSkipSampling = false;           /// --prefer-skip-sampling
    bool preferAlgorithmR = false;             /// --prefer-algorithm-r
    bool hasWeightField = false;               /// Derived.
    bool useBernoulliSampling = false;         /// Derived.
    bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
    bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
    uint seed = 0;                             /// Derived from --static-seed, --seed-value

    /** Process tsv-sample command line arguments.
     *
     * Defines the command line options, performs validation, and derives additional
     * state. std.getopt.getopt is called to do the main option processing followed
     * by additional validation and derivation.
     *
     * Help text is printed to standard output if help was requested. Error text is
     * written to stderr if invalid input is encountered.
     *
     * A tuple is returned. First value is true if command line arguments were
     * successfully processed and execution should continue, or false if an error
     * occurred or the user asked for help. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. Field indices will have been converted to zero-based.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first. Options are consumed
     *             by getopt; on the continue path the array is truncated to the
     *             program name after the remaining args are taken as input files.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : all, canFind, each;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : inputSourceRange, makeFieldListOptionHandler, ReadHeader;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                "k|key-fields",    "<field-list>  Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "w|weight-field",  "NUM  Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
                "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "i|inorder",       "     Output random samples in original input order. Requires use of --n|num.", &preserveInputOrder,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                /* Case sensitive so 'v' (seed-value) and 'V' (version) stay distinct. */
                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header",  "     Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
                "compatibility-mode", "     Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,

                "d|delimiter",     "CHR  Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "prefer-skip-sampling", "     (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
                &preferSkipSampling,

                "prefer-algorithm-r",   "     (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
                &preferAlgorithmR,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Input files. Remaining command line args are files. */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;
            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
            inputSources = inputSourceRange(filepaths, readHeader);

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            if (srsWithReplacement)
            {
                enforce(!hasWeightField,
                        "Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");

                enforce(inclusionProbability.isNaN,
                        "Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");

                enforce(keyFields.length == 0,
                        "Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");

                enforce(!printRandom && !genRandomInorder,
                        "Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");

                enforce(!preserveInputOrder,
                        "Sampling with replacement (--r|replace) does not support input order preservation (--i|inorder option).");
            }

            if (keyFields.length > 0)
            {
                /* Note: useDistinctSampling is set as part of the inclusion probability checks below. */

                enforce(!inclusionProbability.isNaN, "--p|prob is required when using --k|key-fields.");

                if (keyFields.length == 1 && keyFields[0] == 0)
                {
                    /* A lone zero means the whole line is the key. It is left as-is,
                     * there is no field index to convert.
                     */
                    distinctKeyIsFullLine = true;
                }
                else
                {
                    enforce(keyFields.length <= 1 || keyFields.all!(x => x != 0),
                            "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");

                    keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
                }
            }

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */
            if (!inclusionProbability.isNaN)
            {
                enforce(inclusionProbability > 0.0 && inclusionProbability <= 1.0,
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));

                if (keyFields.length > 0) useDistinctSampling = true;
                else useBernoulliSampling = true;

                enforce(!hasWeightField, "--w|weight-field and --p|prob cannot be used together.");

                enforce(!genRandomInorder || useDistinctSampling,
                        "--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used." ~
                        "\nUse --gen-random-inorder alone to print probabilities for all lines." ~
                        "\nUse --p|prob and --print-random to print probabilities for lines satisfying the probability threshold.");
            }
            else if (genRandomInorder && !hasWeightField)
            {
                /* Unweighted --gen-random-inorder is handled by the Bernoulli sampling
                 * routines, even though no inclusion probability was given.
                 */
                useBernoulliSampling = true;
            }

            /* The message names the command line option (--random-value-header), not
             * the randomValueHeader field that stores its value.
             */
            enforce(randomValueHeader.length != 0 && !randomValueHeader.canFind('\n') &&
                    !randomValueHeader.canFind(delim),
                    "--random-value-header must be at least one character and not contain field delimiters or newlines.");

            /* Check for incompatible use of (--i|inorder) and shuffling of the full
             * data set. Sampling with replacement is also incompatible, this is
             * detected earlier. Shuffling is the default operation, so it is identified
             * by eliminating the other modes of operation.
             */
            enforce(!preserveInputOrder ||
                    sampleSize != 0 ||
                    useBernoulliSampling ||
                    useDistinctSampling,
                    "Preserving input order (--i|inorder) is not compatible with full data set shuffling. Switch to random sampling with a sample size (--n|num) to use --i|inorder.");

            /* Compatibility mode checks:
             * - Random value printing implies compatibility-mode, otherwise user's
             *   selection is used.
             * - Distinct sampling doesn't support compatibility-mode. The routines
             *   don't care, but users might expect larger probabilities to be a
             *   superset of smaller probabilities. This would be confusing, so
             *   flag it as an error.
             *
             * The distinct sampling check runs first so that it only rejects an
             * explicit --compatibility-mode, not the implied setting below.
             */
            enforce(!(compatibilityMode && useDistinctSampling),
                    "Distinct sampling (--k|key-fields --p|prob) does not support --compatibility-mode.");

            if (printRandom || genRandomInorder) compatibilityMode = true;

            /* Seed. An explicit non-zero --v|seed-value takes precedence over the
             * fixed seed used by --s|static-seed.
             */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
/** Top-level dispatcher for tsv-sample.
 *
 * Examines the processed command line options and calls the one sampling routine
 * matching the requested mode. The checks are made in priority order; the first
 * match wins:
 * $(LIST
 *     * Sampling with replacement
 *     * Bernoulli sampling
 *     * Distinct sampling (with or without random values generated for all lines)
 *     * Weighted random value generation in input order
 *     * Random sampling, when a sample size was given
 *     * Shuffling, when nothing else applies
 * )
 */
void tsvSample(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.srsWithReplacement)
    {
        simpleRandomSamplingWithReplacement(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useBernoulliSampling)
    {
        bernoulliSamplingCommand(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useDistinctSampling)
    {
        if (cmdopt.genRandomInorder)
        {
            distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
        return;
    }

    if (cmdopt.genRandomInorder)
    {
        /* The modes checked above either deal with gen-random-inorder on their own
         * (Bernoulli, Distinct) or don't support it (SRS w/ Replacement). Only the
         * weighted case gets here.
         */
        assert(cmdopt.hasWeightField);
        generateWeightedRandomValuesInorder(cmdopt, outputStream);
        return;
    }

    if (cmdopt.sampleSize != 0)
    {
        randomSamplingCommand(cmdopt, outputStream);
        return;
    }

    shuffleCommand(cmdopt, outputStream);
}
479 
/** Command handler for Bernoulli sampling.
 *
 * Picks the Bernoulli sampling routine, and its template instantiation, that fits
 * the command line arguments.
 *
 * The main decision is between the per-line (vanilla) algorithm and skip sampling.
 * Skip sampling is a little bit faster when the inclusion probability is small but
 * doesn't support compatibility mode. The per-line algorithm is therefore used when
 * compatibility mode is on, or when the inclusion probability is above
 * skipSamplingProbabilityThreshold and skip sampling was not explicitly preferred.
 * See the bernoulliSkipSampling documentation for a discussion of the threshold.
 */
void bernoulliSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(!cmdopt.hasWeightField);

    immutable double skipSamplingProbabilityThreshold = 0.04;

    immutable bool usePerLineAlgorithm =
        cmdopt.compatibilityMode ||
        (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling);

    if (!usePerLineAlgorithm)
    {
        bernoulliSkipSampling(cmdopt, outputStream);
        return;
    }

    if (cmdopt.genRandomInorder) bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
    else bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
}
515 
/** Bernoulli sampling of lines from the input stream.
 *
 * A random value is drawn for every input line. With No.generateRandomAll a line is
 * written only if its value is less than cmdopt.inclusionProbability. With
 * Yes.generateRandomAll every line is written, prefixed by its random value. Line
 * order is never changed. If cmdopt.sampleSize is non-zero, output stops once that
 * many lines have been written.
 *
 * This routine supports random value printing and gen-random-inorder value printing.
 */
void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix;

    /* prependRandomValue: whether the random value column is written. Always on when
     * generating random values for all lines, otherwise controlled by --print-random.
     */
    static if (generateRandomAll)
    {
        assert(cmdopt.genRandomInorder);
        enum bool prependRandomValue = true;
    }
    else
    {
        assert(!cmdopt.genRandomInorder);
        immutable bool prependRandomValue = cmdopt.printRandom;
    }

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    /* Header line. The first header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstSource = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(firstSource.header, firstSource.name, 1);

        if (prependRandomValue)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }

        outputStream.put(firstSource.header);
        outputStream.put("\n");
    }

    /* Data lines. */
    immutable size_t firstBodyLineNum = cmdopt.hasHeader ? 2 : 1;
    immutable bool hasOutputLimit = (cmdopt.sampleSize != 0);
    ulong outputCount = 0;

    foreach (source; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(source.header, source.name, 1);

        foreach (ulong lineNum, line;
                 source.file.bufferedByLine!(KeepTerminator.no).enumerate(firstBodyLineNum))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, source.name, lineNum);

            /* A value is drawn for every line, selected or not. */
            immutable double score = uniform01(randomGenerator);

            static if (!generateRandomAll)
            {
                /* Written as a negated less-than so a NaN probability selects nothing. */
                if (!(score < cmdopt.inclusionProbability)) continue;
            }

            if (prependRandomValue)
            {
                outputStream.formatRandomValue(score);
                outputStream.put(cmdopt.delim);
            }
            outputStream.put(line);
            outputStream.put("\n");

            /* The counter is only maintained when an output limit is in effect. */
            if (hasOutputLimit && ++outputCount == cmdopt.sampleSize) return;
        }
    }
}
606 
/** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips.
 *
 * Skip sampling works by skipping a random number of lines between selections. This
 * can be faster than assigning a random value to each line when the inclusion
 * probability is low, as it reduces the number of calls to the random number
 * generator. Both the random number generator and the log() function are called when
 * calculating the next skip size. These additional log() calls add up as the
 * inclusion probability increases.
 *
 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
 * file-oriented line sampling. This is obviously environment specific. In the
 * environments this implementation has been tested in the performance improvements
 * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
 *
 * The algorithm does not assign random values to individual lines. This makes it
 * incompatible with random value printing. It is not suitable for compatibility mode
 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
 * does not have this property.
 *
 * The algorithm for calculating the skip size has been described by multiple sources.
 * There are two key variants depending on whether the total number of lines in the
 * data set is known in advance. (This implementation does not know the total.)
 * Useful references:
 * $(LIST
 *     * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
 *       ACM Trans on Mathematical Software, 1987. On-line:
 *       http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
 *     * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
 *       "Data Stream Management", Springer-Verlag, 2016. On-line:
 *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
 *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
 *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
 * )
 *
 * Params:
 *   cmdopt = Command line options. The input sources, random seed, inclusion
 *            probability, header flag, and output line limit (sampleSize, where
 *            zero means no limit) are read from it.
 *   outputStream = Output range receiving the header line and the selected lines.
 *            It is taken by `auto ref`, as in the other sampling routines, so that
 *            an lvalue output range is written to directly rather than through a
 *            by-value copy.
 */
void bernoulliSkipSampling(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.conv : to;
    import std.math : log, trunc;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.compatibilityMode);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
    immutable double logDiscardRate = log(discardRate);

    /* Draws the number of lines to skip before the next selected line.
     *
     * Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed
     * interval to (0.0, 1.0], excluding 0.0, so log() is never applied to zero.
     */
    size_t nextSkipCount()
    {
        return (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
    }

    size_t remainingSkips = nextSkipCount();

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        outputStream.put(inputStream.header);
        outputStream.put("\n");
    }

    /* Process each line. Body lines start at line 2 when a header is present. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong numLinesWritten = 0;
    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            if (remainingSkips > 0)
            {
                --remainingSkips;
                continue;
            }

            /* Skip count exhausted: this line is part of the sample. */
            outputStream.put(line);
            outputStream.put("\n");

            /* A sample size of zero means no limit on the number of output lines. */
            if (cmdopt.sampleSize != 0)
            {
                ++numLinesWritten;
                if (numLinesWritten == cmdopt.sampleSize) return;
            }

            remainingSkips = nextSkipCount();
        }
    }
}
709 
/** Sample lines by choosing a random set of distinct keys formed from one or more
 * fields on each line.
 *
 * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling.
 * However, instead of each line being subject to an independent trial, lines are
 * selected based on a key from each line. A portion of keys are randomly selected for
 * output, and every line containing a selected key is included in the output.
 *
 * An example use-case is a query log having <user, query, clicked-url> triples. It is
 * often useful to sample records for a portion of the users, but including all records
 * for the users selected. Distinct sampling supports this by selecting a subset of
 * users to include in the output.
 *
 * Distinct sampling is done by hashing the key and mapping the hash value into
 * buckets sized to hold the inclusion probability. Records having a key mapping to
 * bucket zero are output. Buckets are equal size and therefore may be larger than the
 * inclusion probability. (The other approach would be to have the caller specify the
 * number of buckets. More correct, but less convenient.)
 *
 * When the generateRandomAll flag is set, every line is written, prefixed by its
 * bucket number. Otherwise only lines in bucket zero are written, prefixed by '0'
 * when random value printing is enabled.
 */
void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsv_utils.common.utils : bufferedByLine, InputFieldReordering,
        InputSourceRange, throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll)
    {
        import std.format : formatValue, singleSpec;

        assert(cmdopt.genRandomInorder);
        immutable randomValueFormatSpec = singleSpec("%d");
    }
    else
    {
        assert(!cmdopt.genRandomInorder);
    }

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* Separator placed between fields when a multi-field key is hashed. */
    immutable ubyte[1] keySeparator = [cmdopt.delim];

    /* Number of equal sized buckets: the inverse inclusion probability, rounded. */
    uint bucketCount = to!uint(lrint(1.0 / cmdopt.inclusionProbability));

    /* Field mapping for the key. Not needed when the whole line is the key. */
    auto keyFieldsReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);

    /* The first header was read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstInput = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(firstInput.header, firstInput.name, 1);

        static if (generateRandomAll) immutable bool writeRandomHeader = true;
        else immutable bool writeRandomHeader = cmdopt.printRandom;

        if (writeRandomHeader)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }

        outputStream.put(firstInput.header);
        outputStream.put("\n");
    }

    immutable size_t firstBodyLineNum = cmdopt.hasHeader ? 2 : 1;
    ulong linesEmitted = 0;

    /* Records one emitted line. Returns true once the output line limit has been
     * reached. A sample size of zero means there is no limit.
     */
    bool reachedOutputLimit()
    {
        if (cmdopt.sampleSize == 0) return false;
        ++linesEmitted;
        return linesEmitted == cmdopt.sampleSize;
    }

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(firstBodyLineNum))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            /* Murmurhash is fed the key piece by piece and then finalized. The
             * full-line key and field-based key cases are handled separately.
             */
            auto hasher = MurmurHash3!32(cmdopt.seed);

            if (!cmdopt.distinctKeyIsFullLine)
            {
                assert(keyFieldsReordering !is null);

                /* Collect the key fields, stopping as soon as all are found. */
                keyFieldsReordering.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    keyFieldsReordering.processNextField(fieldIndex, fieldValue);
                    if (keyFieldsReordering.allFieldsFilled) break;
                }

                enforce(keyFieldsReordering.allFieldsFilled,
                        format("Not enough fields in line. File: %s, Line: %s",
                               inputStream.name, fileLineNum));

                /* Hash the key fields in key order, separated by the delimiter. */
                foreach (position, keyPart; keyFieldsReordering.outputFields.enumerate)
                {
                    if (position > 0) hasher.put(keySeparator);
                    hasher.put(cast(ubyte[]) keyPart);
                }
            }
            else
            {
                hasher.put(cast(ubyte[]) line);
            }

            hasher.finish;
            auto bucket = hasher.get % bucketCount;

            static if (generateRandomAll)
            {
                /* Every line is written, prefixed with its bucket number. */
                outputStream.formatValue(bucket, randomValueFormatSpec);
                outputStream.put(cmdopt.delim);
                outputStream.put(line);
                outputStream.put("\n");

                if (reachedOutputLimit()) return;
            }
            else if (bucket == 0)
            {
                /* Only bucket zero is sampled, so the printed value is always '0'. */
                if (cmdopt.printRandom)
                {
                    outputStream.put('0');
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");

                if (reachedOutputLimit()) return;
            }
        }
    }
}
864 
/** Random sampling command handler. Invokes the appropriate sampling routine based on
 * the command line arguments.
 *
 * Random sampling selects a fixed size random sample from the input stream. Both
 * simple random sampling (equal likelihood) and weighted random sampling are
 * supported. Selected lines are output either in random order or original input order.
 * For weighted sampling the random order is the weighted selection order.
 *
 * Two algorithms are used, reservoir sampling via a heap and reservoir sampling via
 * Algorithm R. This routine selects the appropriate reservoir sampling function and
 * template instantiation based on the command line arguments.
 *
 * Weighted sampling always uses the heap approach. Compatibility mode does as well,
 * as it is the method that uses per-line random value assignments. The implication
 * of compatibility mode is that a larger sample size includes all the results from
 * a smaller sample, assuming the same random seed is used.
 *
 * For unweighted sampling there is a performance tradeoff between implementations.
 * Heap-based sampling is faster for small sample sizes. Algorithm R is faster for
 * large sample sizes. The threshold used was chosen based on performance tests. See
 * the reservoirSamplingAlgorithmR documentation for more information.
 */
void randomSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(cmdopt.sampleSize != 0);

    /* Sample sizes below this use the heap unless Algorithm R is preferred. */
    enum size_t algorithmRSampleSizeThreshold = 128 * 1024;

    /* Weighted sampling is only available in the heap-based implementation. */
    if (cmdopt.hasWeightField)
    {
        if (cmdopt.preserveInputOrder)
            reservoirSamplingViaHeap!(Yes.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream);
        else
            reservoirSamplingViaHeap!(Yes.isWeighted, No.preserveInputOrder)(cmdopt, outputStream);
        return;
    }

    immutable bool sampleIsSmall = cmdopt.sampleSize < algorithmRSampleSizeThreshold;
    immutable bool useHeap =
        cmdopt.compatibilityMode || (sampleIsSmall && !cmdopt.preferAlgorithmR);

    if (useHeap)
    {
        if (cmdopt.preserveInputOrder)
            reservoirSamplingViaHeap!(No.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream);
        else
            reservoirSamplingViaHeap!(No.isWeighted, No.preserveInputOrder)(cmdopt, outputStream);
        return;
    }

    if (cmdopt.preserveInputOrder)
        reservoirSamplingAlgorithmR!(Yes.preserveInputOrder)(cmdopt, outputStream);
    else
        reservoirSamplingAlgorithmR!(No.preserveInputOrder)(cmdopt, outputStream);
}
927 
/** Reservoir sampling using a heap. Both weighted and unweighted random sampling are
 * supported.
 *
 * The algorithm used here is based on the one-pass algorithm described by Pavlos
 * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are
 * simply set to one.
 *
 * The implementation uses a heap (priority queue) large enough to hold the desired
 * number of lines. Input is read line-by-line, assigned a random value, and added to
 * the heap. The role of the heap is to identify the lines with the highest assigned
 * random values. Once the heap is full, adding a new line means dropping the line with
 * the lowest score. A "min" heap is used for this reason.
 *
 * When done reading all lines, the "min" heap is in reverse of weighted selection
 * order. Weighted selection order is obtained by removing each element one at a time
 * from the heap. The underlying data store will have the elements in weighted selection
 * order (largest weights first).
 *
 * Generating output in weighted order is useful for several reasons:
 *  - For weighted sampling, it preserves the property that smaller valid subsets can be
 *    created by taking the first N lines.
 *  - For unweighted sampling, it ensures that all output permutations are possible, and
 *    are not influenced by input order or the heap data structure used.
 *  - Order consistency is maintained when making repeated use of the same random seed,
 *    but with different sample sizes.
 *
 * The other choice is preserving input order. This is supported by recording line
 * numbers and sorting the selected sample.
 *
 * There are use cases where only the selection set matters. For these some performance
 * could be gained by skipping the reordering and simply printing the backing store
 * array in-order. Performance tests indicate only a minor benefit, so this is not
 * supported.
 *
 * Notes:
 * $(LIST
 *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
 *      randomization of all input lines. This was dropped in version 1.2.2 in favor
 *      of the approach used in randomizeLines. The latter has significant advantages
 *      given that all data must be read into memory.
 *    * For large reservoir sizes better performance can be achieved using Algorithm R.
 *      See the reservoirSamplingAlgorithmR documentation for details.
 * )
 */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : sort;
    import std.container.array;
    import std.container.binaryheap;
    import std.meta : AliasSeq;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto rng = Random(cmdopt.seed);

    /* A selected line and its score. The input line number is only stored when the
     * sample is to be written in input order.
     */
    static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
    {
        double score;
        const(char)[] line;
        static if (preserveInputOrder) ulong lineNumber;
    }

    alias ReservoirEntry = Entry!preserveInputOrder;

    /* Create the heap and backing data store.
     *
     * Note: An std.container.array is used as the backing store to avoid some issues in
     * the standard library (Phobos) binaryheap implementation. Specifically, when an
     * std.container.array is used as backing store, the heap can be efficiently
     * reversed by removing the heap elements. This leaves the backing store in the
     * reversed order. However, the current binaryheap implementation does not support
     * this for all backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */
    Array!ReservoirEntry dataStore;
    dataStore.reserve(cmdopt.sampleSize);
    auto reservoir = dataStore.heapify!("a.score > b.score")(0);  // Min binaryheap

    /* The first header was read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstInput = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(firstInput.header, firstInput.name, 1);

        if (cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(firstInput.header);
        outputStream.put("\n");
    }

    immutable size_t firstBodyLineNum = cmdopt.hasHeader ? 2 : 1;

    /* Extra constructor arguments for an entry: the running line number when input
     * order is preserved, nothing otherwise.
     */
    static if (preserveInputOrder)
    {
        ulong totalLineNum = 0;
        alias entryCTArgs = AliasSeq!(totalLineNum);
    }
    else
    {
        alias entryCTArgs = AliasSeq!();
    }

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(firstBodyLineNum))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            static if (isWeighted)
            {
                /* Lines with a non-positive weight get score zero and do not
                 * consume a random value.
                 */
                immutable double lineWeight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum);
                immutable double lineScore =
                    (lineWeight > 0.0)
                    ? uniform01(rng) ^^ (1.0 / lineWeight)
                    : 0.0;
            }
            else
            {
                immutable double lineScore = uniform01(rng);
            }

            /* Fill the reservoir first. After that a line is kept only when it
             * outscores the current minimum, which it then replaces. The line is
             * copied only when it is actually kept.
             */
            immutable bool hasRoom = reservoir.length < cmdopt.sampleSize;
            if (hasRoom || reservoir.front.score < lineScore)
            {
                auto candidate = ReservoirEntry(lineScore, line.dup, entryCTArgs);
                if (hasRoom) reservoir.insert(candidate);
                else reservoir.replaceFront(candidate);
            }

            static if (preserveInputOrder) ++totalLineNum;
        }
    }

    /* Done with input, all entries are in the reservoir. */

    /* The asserts here avoid issues with the current binaryheap implementation. They
     * detect use of backing stores having a length not synchronized to the reservoir.
     */
    immutable ulong sampledCount = reservoir.length;
    assert(sampledCount == dataStore.length);

    /* Put the backing store into the desired output order. */
    static if (preserveInputOrder)
    {
        dataStore[].sort!((a, b) => a.lineNumber < b.lineNumber);
    }
    else
    {
        /* Output in weighted selection order. The heap is in reverse order of assigned
         * weights. Reversing order is done by removing all elements from the heap. This
         * leaves the backing store in the correct order.
         */
        while (!reservoir.empty) reservoir.removeFront;
    }

    assert(sampledCount == dataStore.length);

    foreach (selected; dataStore[])
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatRandomValue(selected.score);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(selected.line);
        outputStream.put("\n");
    }
}
1109 
/** Generate weighted random values for all input lines, preserving input order.
 *
 * This complements weighted reservoir sampling, but instead of using a reservoir it
 * simply iterates over the input lines generating the values. The weighted random
 * values are generated with the same formula used by reservoirSamplingViaHeap. Each
 * line is written prefixed by its random value. Output stops early when a non-zero
 * sample size is given and that many lines have been written.
 */
void generateWeightedRandomValuesInorder(OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.hasWeightField);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto rng = Random(cmdopt.seed);

    /* The first header was read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstInput = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(firstInput.header, firstInput.name, 1);

        outputStream.put(cmdopt.randomValueHeader);
        outputStream.put(cmdopt.delim);
        outputStream.put(firstInput.header);
        outputStream.put("\n");
    }

    immutable size_t firstBodyLineNum = cmdopt.hasHeader ? 2 : 1;
    immutable bool hasOutputLimit = cmdopt.sampleSize != 0;
    ulong linesEmitted = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(firstBodyLineNum))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            immutable double weight =
                getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum);

            /* Lines without a positive weight score zero and consume no random value. */
            immutable double score =
                !(weight > 0.0)
                ? 0.0
                : uniform01(rng) ^^ (1.0 / weight);

            outputStream.formatRandomValue(score);
            outputStream.put(cmdopt.delim);
            outputStream.put(line);
            outputStream.put("\n");

            if (hasOutputLimit)
            {
                ++linesEmitted;
                if (linesEmitted == cmdopt.sampleSize) return;
            }
        }
    }
}
1176 
/** Reservoir sampling via Algorithm R
 *
 * This is an implementation of reservoir sampling using what is commonly known as
 * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of
 * Computer Programming, Volume 2: Seminumerical Algorithms". More information about
 * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with
 * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
 *
 * Algorithm R is used for unweighted sampling without replacement. The heap-based
 * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
 *
 * The classic algorithm stops after identifying the selected set of items. This
 * implementation goes one step further and randomizes the order of the selected
 * lines. This is consistent with shuffling (line order randomization), a primary
 * tsv-sample use-case.
 *
 * This algorithm is faster than reservoirSamplingViaHeap when the sample size
 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
 * Insertion in this algorithm is O(1). Similarly, generating the random order in the
 * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
 *
 * This speed advantage may be offset a certain amount by using a more expensive random
 * value generator. reservoirSamplingViaHeap generates values between zero and one,
 * whereas reservoirSamplingAlgorithmR generates random integers over an ever growing
 * interval. The latter is expected to be more expensive. This is consistent with
 * performance tests indicating that reservoirSamplingViaHeap is faster when using
 * small-to-medium size reservoirs and large input streams.
 *
 * Params:
 *   cmdopt = Command line options. The input sources, random seed, header flag and
 *            sample size (must be non-zero) are read from it.
 *   outputStream = Output range receiving the header line and the sampled lines.
 */
void reservoirSamplingAlgorithmR(Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.meta : AliasSeq;
    import std.random : Random = Mt19937, randomShuffle, uniform;
    import std.algorithm : sort;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.sampleSize > 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.compatibilityMode);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* A selected line. The input line number is only stored when the sample is to
     * be written in input order.
     */
    static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
    {
        const(char)[] line;
        static if (preserveInputOrder) ulong lineNumber;
    }

    Entry!preserveInputOrder[] reservoir;
    auto reservoirAppender = appender(&reservoir);
    reservoirAppender.reserve(cmdopt.sampleSize);

    auto randomGenerator = Random(cmdopt.seed);

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        outputStream.put(inputStream.header);
        outputStream.put("\n");
    }

    /* Process each line. totalLineNum is the zero-based index of the current line,
     * i.e. the number of body lines seen before it.
     */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong totalLineNum = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            /* Add lines to the reservoir until the reservoir is filled.
             * After that lines are added with decreasing likelihood, based on
             * the total number of lines seen. If added to the reservoir, the
             * line replaces a randomly chosen existing line.
             */
            static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
            else alias entryCTArgs = AliasSeq!();

            if (totalLineNum < cmdopt.sampleSize)
            {
                reservoirAppender ~= Entry!preserveInputOrder(line.idup, entryCTArgs);
            }
            else
            {
                /* Including the current line, totalLineNum + 1 lines have been seen.
                 * The slot must be drawn from all of them, [0, totalLineNum], so the
                 * line is kept with probability sampleSize / (totalLineNum + 1).
                 * uniform() excludes its upper bound, hence the '+ 1'. Drawing from
                 * [0, totalLineNum) instead would always admit the first line read
                 * after the reservoir fills.
                 */
                immutable size_t i = uniform(0, totalLineNum + 1, randomGenerator);
                if (i < reservoir.length)
                {
                    reservoir[i] = Entry!preserveInputOrder(line.idup, entryCTArgs);
                }
            }

            ++totalLineNum;
        }
    }

    /* Done with input. The sample is in the reservoir. Update the order and print. */

    static if (preserveInputOrder)
    {
        reservoir.sort!((a, b) => a.lineNumber < b.lineNumber);
    }
    else
    {
        reservoir.randomShuffle(randomGenerator);
    }

    foreach (ref entry; reservoir)
    {
        outputStream.put(entry.line);
        outputStream.put("\n");
    }
}
1301 
/** Shuffling command handler. Invokes the appropriate shuffle (line order
 * randomization) routine based on the command line arguments.
 *
 * Shuffling has similarities to random sampling, but the algorithms used are
 * different. Random sampling selects a subset, only the current subset selection
 * needs to be kept in memory. This is supported by reservoir sampling. By contrast,
 * shuffling needs to hold all input in memory, so it works better to read all lines
 * into memory at once and then shuffle.
 *
 * Two different algorithms are used. Array shuffling is used for unweighted shuffling.
 * Sorting plus random weight assignments is used for weighted shuffling and when
 * compatibility mode is being used.
 *
 * The algorithms used here are all limited by available memory.
 */
void shuffleCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    /* Weighted shuffling: sort on weighted random values. */
    if (cmdopt.hasWeightField)
    {
        randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    /* Compatibility mode: sort on unweighted per-line random values. */
    if (cmdopt.compatibilityMode)
    {
        randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
        return;
    }

    /* Plain unweighted shuffling: shuffle the array of lines. */
    randomizeLinesViaShuffle(cmdopt, outputStream);
}
1333 
/** Shuffle all input lines by assigning random weights and sorting.
 *
 * randomizeLinesViaSort reads in all input lines and writes them out in random order.
 * The algorithm works by assigning a random value to each line and sorting. Both
 * weighted and unweighted shuffling are supported.
 *
 * Notes:
 * $(LIST
 *   * For unweighted shuffling randomizeLinesViaShuffle is faster and should be used
 *     unless compatibility mode is needed.
 *   * This routine is significantly faster than heap-based reservoir sampling in the
 *     case where the entire file is being read.
 *   * Input data must be read entirely in memory. Disk oriented techniques are needed
 *     when data sizes get too large for available memory. One option is to generate
 *     random values for each line, e.g. --gen-random-inorder, and sort with a disk-
 *     backed sort program like GNU sort.
 * )
 */
void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : sort;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize == 0);

    /* Load all file data into memory, then break it into lines, each assigned a
     * random value. readFileData also writes the first header line.
     */
    const fileData = readFileData!(Yes.hasRandomValue)(cmdopt, outputStream);
    auto inputLines = identifyInputLines!(Yes.hasRandomValue, isWeighted)(fileData, cmdopt);

    /* Order the lines by descending random value. */
    sort!((a, b) => a.randomValue > b.randomValue)(inputLines);

    /* Write the lines, each prefixed by its random value when requested. */
    foreach (lineEntry; inputLines)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatRandomValue(lineEntry.randomValue);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(lineEntry.data);
        outputStream.put("\n");
    }
}
1386 
/** Shuffle (randomize) all input lines using an array shuffling algorithm.
 *
 * All lines from files and/or standard input are read into memory, shuffled with
 * std.random.randomShuffle using a generator seeded from cmdopt.seed, and written out
 * in the resulting order. Array shuffling is faster than sorting, which makes this a
 * good alternative to randomizeLinesViaSort for unweighted shuffling (the most common
 * case).
 *
 * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 *
 * This routine does not support random value printing or compatibility-mode.
 */
void randomizeLinesViaShuffle(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, randomShuffle;

    /* Options this routine cannot honor must have been routed elsewhere. */
    assert(cmdopt.sampleSize == 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    /* Load all data into memory and locate the individual lines. */
    const blocks = readFileData!(No.hasRandomValue)(cmdopt, outputStream);
    auto lines = identifyInputLines!(No.hasRandomValue, No.isWeighted)(blocks, cmdopt);

    /* Shuffle in place, then emit.
     *
     * Note: randomCover was also tried, but that was exceedingly slow.
     */
    auto rng = Random(cmdopt.seed);
    lines.randomShuffle(rng);

    foreach (ref entry; lines)
    {
        outputStream.put(entry.data);
        outputStream.put("\n");
    }
}
1433 
/** Simple random sampling with replacement.
 *
 * All lines from files and/or standard input are read into memory. Lines are then
 * picked uniformly at random, one at a time, and written out. The same line can be
 * picked any number of times. This continues until the requested number of samples
 * (--n|num) has been written. If no sample size was given (cmdopt.sampleSize is
 * zero), output continues indefinitely. Nothing is written if there are no input
 * lines.
 */
void simpleRandomSamplingWithReplacement(OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform;

    /* Load all data into memory and locate the individual lines. */
    const blocks = readFileData!(No.hasRandomValue)(cmdopt, outputStream);
    const lines = identifyInputLines!(No.hasRandomValue, No.isWeighted)(blocks, cmdopt);

    /* Nothing to sample from. */
    if (lines.length == 0) return;

    auto rng = Random(cmdopt.seed);

    /* Repeat forever if sampleSize is zero, otherwise print sampleSize lines. */
    immutable bool unbounded = (cmdopt.sampleSize == 0);
    size_t remaining = cmdopt.sampleSize;

    while (unbounded || remaining != 0)
    {
        immutable size_t pick = uniform(0, lines.length, rng);
        outputStream.put(lines[pick].data);
        outputStream.put("\n");
        if (!unbounded) remaining--;
    }
}
1469 
/** A container holding data read from a file or standard input.
 *
 * The InputBlock struct is used to represent a block of data read from a file or
 * standard input. An array of InputBlocks is returned by readFileData. Typically one
 * block per file. Multiple blocks are used for standard input and when the file size
 * cannot be determined. Individual lines are not allowed to span blocks. The blocks
 * allocated to an individual file are numbered starting with zero.
 *
 * The read helpers construct blocks positionally, e.g. InputBlock(filename, 0), so
 * the order of the first two members is significant.
 *
 * See readFileData() for more information.
 */
static struct InputBlock
{
    string filename;          /// Original filename or path. "-" denotes standard input.
    size_t fileBlockNumber;   /// Zero-based block number for the file.
    char[] data;              /// The actual data. Newline terminated or last block for the file.
}
1486 
/** Read all data from the input sources into memory. Used by the algorithms that need
 * the complete input before producing output.
 *
 * The data is returned as an array of InputBlock structs. When the size of a file can
 * be determined the file is read as a single block, with the memory reserved up
 * front. Standard input, and files whose size cannot be determined, are read as one
 * or more blocks. Using multiple blocks in those cases avoids expensive memory
 * reallocations.
 *
 * Individual lines never span multiple blocks, and newlines are preserved. Each block
 * therefore starts at the beginning of a line and ends with a newline, unless the end
 * of the file has been reached.
 *
 * If header processing is on and the first input source has a non-empty header, that
 * header is written to outputStream before any data is read. It is preceded by the
 * random value header and the delimiter when cmdopt.printRandom is set.
 *
 * Each file gets its own block. Prior to using InputSourceRange this was so header
 * processing can be done. With InputSourceRange the header is read separately, so
 * this could be changed.
 */
InputBlock[] readFileData(HasRandomValue hasRandomValue, OutputRange)
(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : InputSourceRange, throwIfWindowsNewlineOnUnix;

    static if (!hasRandomValue) assert(!cmdopt.printRandom);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* The first header was already read during command line argument processing.
     * Write it out now, ahead of any data.
     */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstSource = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(firstSource.header, firstSource.name, 1);

        if (cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(firstSource.header);
        outputStream.put("\n");
    }

    enum BlockSize = 1024L * 1024L * 1024L;  // 1 GB. ('L' notation avoids overflow w/ 2GB+ sizes.)
    enum ReadSize = 1024L * 128L;            // Size of the buffer used for raw reads.
    enum NewlineSearchSize = 1024L * 16L;    // Backward newline search limit for multi-block reads.

    InputBlock[] result;
    auto resultAppender = appender(&result);
    resultAppender.reserve(cmdopt.inputSources.length);  // At least one block per file.

    /* A single read buffer is shared by all the files. */
    ubyte[] readBuffer = new ubyte[ReadSize];

    foreach (source; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(source.header, source.name, 1);

        /* File.size() returns ulong.max when the size cannot be determined. Standard
         * input is mapped to the same value so both cases take the multi-block path.
         */
        immutable ulong filesize = source.isStdin ? ulong.max : source.file.size;
        immutable bool sizeUnknown = (filesize == ulong.max);

        /* The helpers take the File by reference, so a local copy is needed. */
        auto ifile = source.file;

        if (sizeUnknown)
        {
            readFileDataAsMultipleBlocks(
                source.name, ifile, resultAppender, readBuffer,
                BlockSize, NewlineSearchSize);
        }
        else
        {
            readFileDataAsOneBlock(source.name, ifile, filesize,
                                   resultAppender, readBuffer);
        }
    }

    return result;
}
1570 
/* readFileData() helper function. Reads everything from a File handle into a single
 * new InputBlock, which is appended to an existing InputBlock[] array.
 *
 * readFileDataAsOneBlock is part of the readFileData logic. It handles the case where
 * a file is being read as a single block. Normally initialBlockSize is passed as the
 * size of the file; it is used to reserve space for the block's data before reading.
 * Data is read in chunks using rawReadBuffer as the read buffer.
 *
 * This routine has been separated out to enable unit testing. At present it is not
 * intended as a general API. See readFileData for more info.
 */
private void readFileDataAsOneBlock(
    string filename,
    ref File ifile,
    const ulong initialBlockSize,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer)
{
    /* Add block zero for this file, then write into its data array directly. */
    blocksAppender.put(InputBlock(filename, 0));

    auto blockData = appender(&(blocksAppender.data[$-1].data));
    blockData.reserve(initialBlockSize);

    foreach (ref ubyte[] chunk; ifile.byChunk(rawReadBuffer))
        blockData.put(cast(char[]) chunk);
}
1597 
/* readFileData() helper function. Read data from a File handle as one or more blocks.
 * Blocks are appended to an existing InputBlock[] array.
 *
 * readFileDataAsMultipleBlocks is part of the readFileData logic. It handles the case
 * where a file or standard input is being read as a series of blocks. This is the
 * standard approach for standard input, but also applies when the file size cannot be
 * determined.
 *
 * Parameters:
 *   filename          - Name stored in every InputBlock created for this file.
 *   ifile             - File to read from. Must be open.
 *   blocksAppender    - Appender receiving the new InputBlocks.
 *   rawReadBuffer     - Buffer handed to File.byChunk for the raw reads.
 *   blockSize         - Amount of space reserved for each new block's data.
 *   newlineSearchSize - Maximum number of bytes at the end of the current block that
 *                       are searched for a newline when a block has to be split.
 *
 * Blocks are split only immediately after a newline. If no suitable newline can be
 * found the current block is allowed to grow instead.
 *
 * This routine has been separated out to enable unit testing. At present it is not
 * intended as a general API. See readFileData for more info.
 */
private void readFileDataAsMultipleBlocks(
    string filename,
    ref File ifile,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer,
    const size_t blockSize,
    const size_t newlineSearchSize)
{
    import std.algorithm : find, min;
    import std.range : retro;

    assert(ifile.isOpen);

    /* Create a new block for the file and an Appender for writing data. The data
     * appender writes directly into the 'data' member of the last block in the
     * blocks array.
     */
    blocksAppender.put(InputBlock(filename, 0));
    auto dataAppender = appender(&(blocksAppender.data[$-1].data));
    dataAppender.reserve(blockSize);
    size_t blockNumber = 0;

    /* Read all the data and copy it to an InputBlock. */
    foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
    {
        assert(blockNumber == blocksAppender.data[$-1].fileBlockNumber);

        /* Space left in the current block, based on the appender's reported capacity. */
        immutable size_t remainingCapacity = dataAppender.capacity - dataAppender.data.length;

        if (buffer.length <= remainingCapacity)
        {
            /* The whole read buffer fits in the current block. */
            dataAppender.put(cast(char[]) buffer);
        }
        else
        {
            /* Look for the last newline in the input buffer that fits in remaining
             * capacity of the block. appendRegion is the leading part of the read
             * buffer up to and including that newline. It is empty if the part that
             * fits contains no newline.
             */
            auto searchRegion = buffer[0 .. remainingCapacity];
            auto appendRegion = searchRegion.retro.find('\n').source;

            if (appendRegion.length > 0)
            {
                /* Copy the first part of the read buffer to the block. */
                dataAppender.put(cast(char[]) appendRegion);

                /* Create a new InputBlock and copy the remaining data to it. */
                blockNumber++;
                blocksAppender.put(InputBlock(filename, blockNumber));
                dataAppender = appender(&(blocksAppender.data[$-1].data));
                dataAppender.reserve(blockSize);
                dataAppender.put(cast(char[]) buffer[appendRegion.length .. $]);

                /* The block just completed must end with a newline. */
                assert(blocksAppender.data.length >= 2);
                assert(blocksAppender.data[$-2].data[$-1] == '\n');
            }
            else
            {
                /* Search backward in the current block for a newline. If found, it
                 * becomes the last newline in the current block. Anything following
                 * it is moved to the new block. If a newline is not found, simply
                 * append to the current block and let it grow. The backward search
                 * is limited to the last newlineSearchSize bytes of the block.
                 */
                immutable size_t currBlockLength = blocksAppender.data[$-1].data.length;
                immutable size_t searchLength = min(currBlockLength, newlineSearchSize);
                immutable size_t searchStart = currBlockLength - searchLength;
                auto blockSearchRegion = blocksAppender.data[$-1].data[searchStart .. $];

                /* Length of the search region through its last newline. Zero if the
                 * search region contains no newline.
                 */
                auto lastNewlineOffset = blockSearchRegion.retro.find('\n').source.length;

                if (lastNewlineOffset != 0)
                {
                    /* Create a new InputBlock. The previous InputBlock is then found
                     * at blocksAppender.data[$-2]. It may be a physically different
                     * struct (a copy) if the blocks array gets reallocated.
                     */
                    blockNumber++;
                    blocksAppender.put(InputBlock(filename, blockNumber));
                    dataAppender = appender(&(blocksAppender.data[$-1].data));
                    dataAppender.reserve(blockSize);

                    /* Copy data following the newline from the last block to the new
                     * block. Then append the current read buffer.
                     */
                    immutable size_t moveRegionStart = searchStart + lastNewlineOffset;
                    dataAppender.put(blocksAppender.data[$-2].data[moveRegionStart .. $]);
                    dataAppender.put(cast(char[]) buffer);

                    /* Now delete the moved region from the last block. */
                    blocksAppender.data[$-2].data.length = moveRegionStart;

                    /* The block just completed must end with a newline. */
                    assert(blocksAppender.data.length >= 2);
                    assert(blocksAppender.data[$-2].data[$-1] == '\n');
                }
                else
                {
                    /* Give up. Allow the current block to grow. */
                    dataAppender.put(cast(char[]) buffer);
                }
            }
        }
    }
}
1710 
/** HasRandomValue is a boolean flag used at compile time to distinguish use cases
 * needing random value assignments from those that don't. It is a template parameter
 * of readFileData, InputLine, and identifyInputLines.
 */
alias HasRandomValue = Flag!"hasRandomValue";
1715 
/** An InputLine array is returned by identifyInputLines to represent each non-header
 * line found in an InputBlock array. The 'data' member contains the line. A
 * 'randomValue' member is included if random values are being generated.
 */
static struct InputLine(HasRandomValue hasRandomValue)
{
    const(char)[] data;                                /// The line's text. identifyInputLines stores it without a newline.
    static if (hasRandomValue) double randomValue;     /// Random value assigned to the line. Only present when hasRandomValue is set.
}
1725 
/** identifyInputLines does the initial processing of file data for the algorithms
 * that read all files into memory before processing.
 *
 * It performs two tasks. The first is splitting all the input data into lines. The
 * second, when random values are being generated (hasRandomValue), is assigning a
 * random value to each line. For unweighted use the value is a uniform random number
 * from uniform01. For weighted use (isWeighted) the weight is read from the line's
 * weight field and the value is the uniform random number raised to the power
 * (1.0 / weight). Lines whose weight is not greater than zero get the value 0.0
 * without consuming a random number.
 *
 * The key input is an InputBlock array. Normally one block for each file, but standard
 * input may have multiple blocks.
 *
 * The return value is an array of InputLine structs. The struct will have a
 * 'randomValue' member if random values are being assigned.
 */
InputLine!hasRandomValue[] identifyInputLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted)
(const ref InputBlock[] inputBlocks, ref TsvSampleOptions cmdopt)
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    /* Weighted processing requires random values. */
    static assert(hasRandomValue || !isWeighted);
    static if (!hasRandomValue) assert(!cmdopt.printRandom);

    alias Line = InputLine!hasRandomValue;

    Line[] result;
    auto resultAppender = appender(&result);

    static if (hasRandomValue) auto rng = Random(cmdopt.seed);

    /* The line counter starts at the number of header lines and is incremented before
     * each line is processed. The value seen while processing a line is therefore the
     * line's one-based position in its file.
     */
    immutable size_t linesBeforeBody = cmdopt.hasHeader ? 1 : 0;
    size_t lineCounter = linesBeforeBody;

    foreach (block; inputBlocks)
    {
        /* Drop the last newline to avoid adding an extra empty line. */
        const(char)[] text = block.data;
        if (text.length > 0 && text[$-1] == '\n') text = text[0 .. $-1];

        /* Block zero marks the start of a new file. Restart the line count. */
        if (block.fileBlockNumber == 0) lineCounter = linesBeforeBody;

        foreach (line; text.splitter('\n'))
        {
            ++lineCounter;

            if (lineCounter == 1) throwIfWindowsNewlineOnUnix(line, block.filename, lineCounter);

            static if (hasRandomValue)
            {
                static if (isWeighted)
                {
                    immutable double weight =
                        getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                             block.filename, lineCounter);
                    immutable double randomValue =
                        (weight > 0.0)
                        ? uniform01(rng) ^^ (1.0 / weight)
                        : 0.0;
                }
                else
                {
                    immutable double randomValue = uniform01(rng);
                }

                resultAppender.put(Line(line, randomValue));
            }
            else
            {
                resultAppender.put(Line(line));
            }
        }
    }

    return result;
}
1800 
1801 
/* Unit tests for the readFileData helpers (readFileDataAsOneBlock and
 * readFileDataAsMultipleBlocks) combined with identifyInputLines. These tests focus
 * on multiple InputBlock scenarios. Other use paths are presumably covered by the
 * command-level tests elsewhere in this file.
 */
unittest
{
    import tsv_utils.common.unittest_utils;
    import std.algorithm : equal, find, joiner, splitter;
    import std.array : appender;
    import std.file : rmdirRecurse;
    import std.path : buildPath;
    import std.range : repeat;

    auto rfdTestDir = makeUnittestTempDir("tsv_sample_readFileData");
    scope(exit) rfdTestDir.rmdirRecurse;

    char[] file1Data;
    char[] file2Data;
    char[] file3Data;

    auto app1 = appender(&file1Data);
    auto app2 = appender(&file2Data);
    auto app3 = appender(&file3Data);

    /* File 1: 1000 short lines. */
    app1.put("\n".repeat(100).joiner);
    app1.put("x\n".repeat(100).joiner);
    app1.put("yz\n".repeat(100).joiner);
    app1.put("pqr\n".repeat(100).joiner);
    app1.put("a\nbc\ndef\n".repeat(100).joiner);
    app1.put('\n'.repeat(100));
    app1.put("z\n".repeat(100).joiner);
    app1.put("xy\n".repeat(100).joiner);

    /* File 2: 600 longer lines (100 + 4 * 100 + 100). */
    app2.put(
        "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
        .repeat(100)
        .joiner);
    app2.put(
        "|abcdefghijklmnopqrstuv|\n|0123456789|\n|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ|\n|abcdefghijklmnopqrstuvwxyz|\n"
        .repeat(100)
        .joiner);
    app2.put(
         "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
        .repeat(100)
        .joiner);

    /* File 3: 1000 mixed length lines. */
    app3.put("\n\n|abcde|\n1\n12\n123\n|abcdefghijklmnop|\n|xyz|\n0123456789\nX\n".repeat(100).joiner);

    string file1Path = buildPath(rfdTestDir, "file1.txt");
    string file2Path = buildPath(rfdTestDir, "file2.txt");
    string file3Path = buildPath(rfdTestDir, "file3.txt");

    /* Write the three test files. */
    try
    {
        auto ofile1 = File(file1Path, "w");
        ofile1.write(file1Data);
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file1Path, e.msg));

    try
    {
        auto ofile2 = File(file2Path, "w");
        ofile2.write(file2Data);
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file2Path, e.msg));

    try
    {
        auto ofile3 = File(file3Path, "w");
        ofile3.write(file3Data);
    }
    catch  (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file3Path, e.msg));

    /* Expected lines when no header processing is done. The final element produced by
     * splitting on the trailing newline is dropped.
     */
    auto allData = file1Data ~ file2Data ~ file3Data;
    auto expectedLines = allData.splitter('\n').array[0 .. $-1];

    /* Expected lines when the first line of files 2 and 3 is treated as a header. */
    auto file2DataNoHeader = (file2Data.find('\n'))[1 .. $];
    auto file3DataNoHeader = (file3Data.find('\n'))[1 .. $];
    auto allDataUsingHeader = file1Data ~ file2DataNoHeader ~ file3DataNoHeader;
    auto expectedLinesUsingHeader = allDataUsingHeader.splitter('\n').array[0 .. $-1];

    assert(expectedLines.length == expectedLinesUsingHeader.length + 2);

    TsvSampleOptions cmdoptNoHeader;
    auto noHeaderCmdArgs = ["unittest", file1Path];
    auto r1 = cmdoptNoHeader.processArgs(noHeaderCmdArgs);
    assert(r1[0], format("Invalid command lines arg: '%s'.", noHeaderCmdArgs));

    TsvSampleOptions cmdoptYesHeader;
    auto yesHeaderCmdArgs = ["unittest", "--header", file1Path];
    auto r2 = cmdoptYesHeader.processArgs(yesHeaderCmdArgs);
    assert(r2[0], format("Invalid command lines arg: '%s'.", yesHeaderCmdArgs));

    auto outputStream = appender!(char[])();

    {
        /* Reading as single blocks. */
        ubyte[] rawReadBuffer = new ubyte[256];
        InputBlock[] blocks;
        auto blocksAppender = appender(&blocks);
        blocksAppender.reserve(3);
        foreach (f; [ file1Path, file2Path, file3Path ])
        {
            auto ifile = f.File;
            ulong filesize = ifile.size;
            if (filesize == ulong.max) filesize = 1000;
            readFileDataAsOneBlock(f, ifile, filesize, blocksAppender, rawReadBuffer);
        }
        auto inputLines =
            identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                blocks, cmdoptNoHeader);

        assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
    }

    {
        /* Reading as multiple blocks. All combinations of several newline search
         * sizes, block sizes, and read buffer sizes are tried.
         */
        foreach (size_t searchSize; [ 0, 1, 2, 64 ])
        {
            foreach (size_t blockSize; [ 1, 2, 16, 64, 256 ])
            {
                foreach (size_t readSize; [ 1, 2, 8, 32 ])
                {
                    ubyte[] rawReadBuffer = new ubyte[readSize];
                    InputBlock[] blocks;
                    auto blocksAppender = appender(&blocks);
                    blocksAppender.reserve(3);
                    foreach (f; [ file1Path, file2Path, file3Path ])
                    {
                        auto ifile = f.File;
                        readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
                                                     rawReadBuffer, blockSize, searchSize);
                    }
                    auto inputLines =
                        identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                            blocks, cmdoptNoHeader);

                    assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
                }
            }
        }
    }
    /* The following test is disabled: version(none) blocks are never compiled. */
    version(none) {
    {
        /* Reading as multiple blocks, with header processing. */
        const size_t readSize = 32;
        const size_t blockSize = 48;
        const size_t searchSize = 16;

        ubyte[] rawReadBuffer = new ubyte[readSize];
        InputBlock[] blocks;
        auto blocksAppender = appender(&blocks);
        blocksAppender.reserve(3);
        foreach (f; [ file1Path, file2Path, file3Path ])
        {
            auto ifile = f.File;
            readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
                                         rawReadBuffer, blockSize, searchSize);
        }
        auto inputLines =
            identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                blocks, cmdoptYesHeader);

        assert(outputStream.data == expectedLinesUsingHeader[0] ~ '\n');
        assert(equal!((a, b) => a.data == b)(inputLines, expectedLinesUsingHeader[1 .. $]));
    }
    }
}
1972 
/** Write a floating point random value to an output stream.
 *
 * This routine is used for floating point random value printing. This routine writes
 * 17 significant digits, the range available in doubles. This routine prefers decimal
 * format, without exponents. It will generate somewhat large precision numbers,
 * currently up to 28 digits, before switching to exponents.
 *
 * The format is chosen from the magnitude of the value: "%.17f" for values of at
 * least 1e-01, one more decimal digit for each further power of ten down to "%.28f"
 * for values of at least 1e-12, and "%.17g" for everything else.
 *
 * The primary reason for this approach is to enable faster sorting on random values
 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster
 * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch).
 * The 'general numeric' handles exponential notation. The difference is 5-10x.
 *
 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12.
 * No examples less than 1e-09 were seen in hundred of millions of trials. Similar
 * results were seen with weighted sampling with integer weights. The same is not true
 * with floating point weights. These produce quite large exponents. However, even
 * for floating point weights this can be useful. For random weights [0,1] less than 5%
 * will be less than 1e-12 and use exponential notation.
 *
 * Params:
 *   outputStream = Output range of char that receives the formatted text.
 *   value        = The value to write.
 */
void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;

    /* Select the format string first so that only the one format spec actually
     * needed is parsed on each call. The thresholds are tested from largest to
     * smallest; the first one the value reaches determines the precision.
     */
    string formatString =
        (value >= 1e-01) ? "%.17f" :
        (value >= 1e-02) ? "%.18f" :
        (value >= 1e-03) ? "%.19f" :
        (value >= 1e-04) ? "%.20f" :
        (value >= 1e-05) ? "%.21f" :
        (value >= 1e-06) ? "%.22f" :
        (value >= 1e-07) ? "%.23f" :
        (value >= 1e-08) ? "%.24f" :
        (value >= 1e-09) ? "%.25f" :
        (value >= 1e-10) ? "%.26f" :
        (value >= 1e-11) ? "%.27f" :
        (value >= 1e-12) ? "%.28f" : "%.17g";

    immutable formatSpec = singleSpec(formatString);

    outputStream.formatValue(value, formatSpec);
}
2028 
/* Tests for formatRandomValue. Each case pairs an input value with the exact text
 * expected on the output stream.
 */
@safe unittest
{
    import std.array : appender;

    static struct Case
    {
        double value;
        string expected;
    }

    immutable Case[] cases = [
        /* Powers of ten, covering every precision step and the switch to exponents. */
        Case(1.0,   "1.00000000000000000"),
        Case(0.1,   "0.10000000000000001"),
        Case(0.01,  "0.010000000000000000"),
        Case(1e-03, "0.0010000000000000000"),
        Case(1e-04, "0.00010000000000000000"),
        Case(1e-05, "0.000010000000000000001"),
        Case(1e-06, "0.0000010000000000000000"),
        Case(1e-07, "0.00000010000000000000000"),
        Case(1e-08, "0.000000010000000000000000"),
        Case(1e-09, "0.0000000010000000000000001"),
        Case(1e-10, "0.00000000010000000000000000"),
        Case(1e-11, "0.000000000009999999999999999"),
        Case(1e-12, "0.0000000000010000000000000000"),
        Case(1e-13, "1e-13"),
        Case(1e-14, "1e-14"),

        /* A 17 significant digit value at each magnitude. */
        Case(12345678901234567e-15, "12.34567890123456735"),
        Case(12345678901234567e-16, "1.23456789012345669"),
        Case(12345678901234567e-17, "0.12345678901234566"),
        Case(12345678901234567e-18, "0.012345678901234567"),
        Case(12345678901234567e-19, "0.0012345678901234567"),
        Case(12345678901234567e-20, "0.00012345678901234567"),
        Case(12345678901234567e-21, "0.000012345678901234568"),
        Case(12345678901234567e-22, "0.0000012345678901234567"),
        Case(12345678901234567e-23, "0.00000012345678901234566"),
        Case(12345678901234567e-24, "0.000000012345678901234567"),
        Case(12345678901234567e-25, "0.0000000012345678901234566"),
        Case(12345678901234567e-26, "0.00000000012345678901234568"),
        Case(12345678901234567e-27, "0.000000000012345678901234567"),
        Case(12345678901234567e-28, "0.0000000000012345678901234567"),
        Case(12345678901234567e-29, "1.2345678901234566e-13"),
    ];

    foreach (testCase; cases)
    {
        auto output = appender!string();
        output.formatRandomValue(testCase.value);
        assert(output.data == testCase.expected,
               format("[testFormatValue] value: %g; expected: %s; actual: %s",
                      testCase.value, testCase.expected, output.data));
    }
}
2072 
2073 
/* Needed at module scope: the template constraint on getFieldValue is evaluated
 * outside the function body, so a function-local import would not be visible to it.
 * It is placed ahead of the documentation comment so the comment attaches to the
 * function rather than to this import.
 */
import std.traits : isSomeChar;

/** Convenience function for extracting a single field from a line. See
 * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error
 * text tailored for this program.
 *
 * Params:
 *   line       = The input line to extract the field from.
 *   fieldIndex = Index of the field to extract, passed through to getTsvFieldValue.
 *   delim      = The field delimiter character.
 *   filename   = Name of the input source, used in error text. The name "-" is
 *                reported as "Standard Input".
 *   lineNum    = Line number used in error text. When it is 1, conversion errors
 *                include a hint that the line may be a header.
 *
 * Returns: The field value converted to type T.
 *
 * Throws: Exception if getTsvFieldValue throws. The message wraps the original
 * message together with the file name and line number.
 */
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, ulong lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException;
    import tsv_utils.common.utils : getTsvFieldValue;

    /* Both error paths report the input source the same way. */
    immutable inputName = (filename == "-") ? "Standard Input" : filename;

    try
    {
        return getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* The field text could not be converted to T. On the first line the most
         * likely cause is an unskipped header, so add a hint.
         */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s%s",
                   exc.msg, inputName, lineNum,
                   (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : ""));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s",
                   exc.msg, inputName, lineNum));
    }
}
2107 
@safe unittest
{
    /* Basic sanity checks for the getFieldValue wrapper. The underlying
     * getTsvFieldValue routine has its own tests.
     */
    import std.exception : assertThrown;

    enum char tab = '\t';
    enum source = "unittest";

    /* Well-formed numeric fields convert to the expected value. */
    assert(getFieldValue!double("123", 0, tab, source, 1) == 123);
    assert(getFieldValue!double("123.4", 0, tab, source, 1) == 123.4);

    /* Both kinds of failure must throw, on the first line and on later lines. */
    foreach (lineNum; [1UL, 2UL])
    {
        assertThrown(getFieldValue!double("abc", 0, tab, source, lineNum));   // Not a number.
        assertThrown(getFieldValue!double("123", 1, tab, source, lineNum));   // No field at index 1.
    }
}
2122 
2123 /* Unit tests for the main program start here.
2124  *
2125  * Portability note: Many of the tests here rely on generating consistent random numbers
2126  * across different platforms when using the same random seed. So far this has succeeded
2127  * on several different platform, compiler, and library versions. However, it is certainly
2128  * possible this condition will not hold on other platforms.
2129  *
2130  * For tsv-sample, this portability implies generating the same results on different
2131  * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
2132  * but it is convenient for testing. If platforms are identified that do not generate
2133  * the same results these tests will need to be adjusted.
2134  */
version(unittest)
{
    /* Helpers shared by the main-program unit tests. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /** Runs tsvSample with the given command line and checks its output.
     *
     * The arguments are processed with TsvSampleOptions.processArgs, tsvSample
     * writes into an in-memory buffer, and the buffer is compared against
     * `expected` after conversion with tsvDataToString. Any mismatch, or a
     * command line that processArgs rejects, fails an assertion.
     *
     * Params:
     *   cmdArgs  = Command line arguments. Must be non-empty; the first element
     *              is used to label assertion messages.
     *   expected = Expected output, as rows of fields.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Prefixes a failure description with the helper name and cmdArgs[0]. */
        string describeFailure(Args...)(string detail, Args detailArgs)
        {
            return format("[testTsvSample] %s: " ~ detail, cmdArgs[0], detailArgs);
        }

        /* Render the arguments before handing them to processArgs, so a failure
         * message shows them as passed in (in case processArgs modifies the array).
         */
        const argsAsText = to!string(cmdArgs);

        TsvSampleOptions options;
        auto argsResult = options.processArgs(cmdArgs);
        assert(argsResult[0], describeFailure("Invalid command lines arg: '%s'.", argsAsText));

        auto captured = appender!(char[])();

        tsvSample(options, captured);    // This invokes the main code line.

        auto wanted = tsvDataToString(expected);

        assert(captured.data == wanted,
               describeFailure(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   to!string(wanted), to!string(captured.data)));
    }
}
2170 
2171 unittest
2172 {
2173     import std.path : buildPath;
2174     import std.file : rmdirRecurse;
2175 
2176     auto testDir = makeUnittestTempDir("tsv_sample");
2177     scope(exit) testDir.rmdirRecurse;
2178 
2179     /* Tabular data sets and expected results use the built-in static seed.
2180      * Tests are run by writing the data set to a file, then calling the main
2181      * routine to process. The function testTsvSample plays the role of the
2182      * main program. Rather than writing to expected output, the results are
2183      * matched against expected. The expected results were verified by hand
2184      * prior to inclusion in the test.
2185      *
2186      * The initial part of this section is simply setting up data files and
2187      * expected results.
2188      *
2189      * Expected results naming conventions:
2190      *  - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected
2191      *  - Sampling Type (required): Permute (Shuffle), Sample, Replace, Bernoulli, Distinct
2192      *  - Compatibility: Compat, AlgoR, Skip, Swap, Inorder
2193      *  - Weight Field: Wt<num>, e.g. Wt3
2194      *  - Sample Size: Num<num>, eg. Num3
2195      *  - Seed Value: V<num>, eg. V77
2196      *  - Key Field: K<num>, e.g. K2
2197      *  - Probability: P<num>, e.g P05 (5%)
2198      *  - Printing Probabilities: Probs
2199      *  - Printing Probs in order: ProbsInorder
2200      *  - Printing Probs with custom header: RVCustom
2201      */
2202 
2203     /* Empty file. */
2204     string[][] dataEmpty = [];
2205     string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
2206     writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
2207 
2208     /* 3x1, header only. */
2209     string[][] data3x0 = [["field_a", "field_b", "field_c"]];
2210     string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
2211     writeUnittestTsvFile(fpath_data3x0, data3x0);
2212 
2213     /* 3x1 */
2214     string[][] data3x1 =
2215         [["field_a", "field_b", "field_c"],
2216          ["tan", "タン", "8.5"]];
2217 
2218     string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
2219     string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
2220     writeUnittestTsvFile(fpath_data3x1, data3x1);
2221     writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1 .. $]);
2222 
2223     string[][] data3x1ExpectedReplaceNum3 =
2224         [["field_a", "field_b", "field_c"],
2225          ["tan", "タン", "8.5"],
2226          ["tan", "タン", "8.5"],
2227          ["tan", "タン", "8.5"]];
2228 
2229     /* 3x2 */
2230     string[][] data3x2 =
2231         [["field_a", "field_b", "field_c"],
2232          ["brown", "褐色", "29.2"],
2233          ["gray", "グレー", "6.2"]];
2234 
2235     string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
2236     string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
2237     writeUnittestTsvFile(fpath_data3x2, data3x2);
2238     writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1 .. $]);
2239 
2240     string[][] data3x2PermuteCompat =
2241         [["field_a", "field_b", "field_c"],
2242          ["gray", "グレー", "6.2"],
2243          ["brown", "褐色", "29.2"]];
2244 
2245     string[][] data3x2PermuteShuffle =
2246         [["field_a", "field_b", "field_c"],
2247          ["gray", "グレー", "6.2"],
2248          ["brown", "褐色", "29.2"]];
2249 
2250     /* 3x3 */
2251     string[][] data3x3 =
2252         [["field_a", "field_b", "field_c"],
2253          ["orange", "オレンジ", "2.5"],
2254          ["pink", "ピンク", "1.1"],
2255          ["purple", "紫の", "42"]];
2256 
2257     string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
2258     string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
2259     writeUnittestTsvFile(fpath_data3x3, data3x3);
2260     writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1 .. $]);
2261 
2262     string[][] data3x3ExpectedPermuteCompat =
2263         [["field_a", "field_b", "field_c"],
2264          ["purple", "紫の", "42"],
2265          ["pink", "ピンク", "1.1"],
2266          ["orange", "オレンジ", "2.5"]];
2267 
2268     string[][] data3x3ExpectedPermuteSwap =
2269         [["field_a", "field_b", "field_c"],
2270          ["purple", "紫の", "42"],
2271          ["orange", "オレンジ", "2.5"],
2272          ["pink", "ピンク", "1.1"]];
2273 
2274     /* 3x6 */
2275     string[][] data3x6 =
2276         [["field_a", "field_b", "field_c"],
2277          ["red", "赤", "23.8"],
2278          ["green", "緑", "0.0072"],
2279          ["white", "白", "1.65"],
2280          ["yellow", "黄", "12"],
2281          ["blue", "青", "12"],
2282          ["black", "黒", "0.983"]];
2283     string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
2284     string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
2285     writeUnittestTsvFile(fpath_data3x6, data3x6);
2286     writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1 .. $]);
2287 
2288     // Randomization, all lines
2289     string[][] data3x6ExpectedPermuteCompat =
2290         [["field_a", "field_b", "field_c"],
2291          ["yellow", "黄", "12"],
2292          ["black", "黒", "0.983"],
2293          ["blue", "青", "12"],
2294          ["white", "白", "1.65"],
2295          ["green", "緑", "0.0072"],
2296          ["red", "赤", "23.8"]];
2297 
2298     string[][] data3x6ExpectedPermuteSwap =
2299         [["field_a", "field_b", "field_c"],
2300          ["black", "黒", "0.983"],
2301          ["green", "緑", "0.0072"],
2302          ["red", "赤", "23.8"],
2303          ["yellow", "黄", "12"],
2304          ["white", "白", "1.65"],
2305          ["blue", "青", "12"]];
2306 
2307     string[][] data3x6ExpectedPermuteCompatProbs =
2308         [["random_value", "field_a", "field_b", "field_c"],
2309          ["0.96055546286515892", "yellow", "黄", "12"],
2310          ["0.75710153928957880", "black", "黒", "0.983"],
2311          ["0.52525980887003243", "blue", "青", "12"],
2312          ["0.49287854949943721", "white", "白", "1.65"],
2313          ["0.15929344086907804", "green", "緑", "0.0072"],
2314          ["0.010968807619065046", "red", "赤", "23.8"]];
2315 
2316     /* Note: data3x6ExpectedSampleAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because
2317      * both are effectively the same algorithm given that --num is data length. Both read
2318      * in the full data in order then call randomShuffle.
2319      */
2320     string[][] data3x6ExpectedSampleAlgoRNum6 =
2321         [["field_a", "field_b", "field_c"],
2322          ["black", "黒", "0.983"],
2323          ["green", "緑", "0.0072"],
2324          ["red", "赤", "23.8"],
2325          ["yellow", "黄", "12"],
2326          ["white", "白", "1.65"],
2327          ["blue", "青", "12"]];
2328 
2329     string[][] data3x6ExpectedSampleAlgoRNum5 =
2330         [["field_a", "field_b", "field_c"],
2331          ["red", "赤", "23.8"],
2332          ["black", "黒", "0.983"],
2333          ["white", "白", "1.65"],
2334          ["green", "緑", "0.0072"],
2335          ["yellow", "黄", "12"]];
2336 
2337     string[][] data3x6ExpectedSampleAlgoRNum4 =
2338         [["field_a", "field_b", "field_c"],
2339          ["blue", "青", "12"],
2340          ["green", "緑", "0.0072"],
2341          ["black", "黒", "0.983"],
2342          ["white", "白", "1.65"]];
2343 
2344     string[][] data3x6ExpectedSampleAlgoRNum3 =
2345         [["field_a", "field_b", "field_c"],
2346          ["red", "赤", "23.8"],
2347          ["black", "黒", "0.983"],
2348          ["green", "緑", "0.0072"]];
2349 
2350     string[][] data3x6ExpectedSampleAlgoRNum2 =
2351         [["field_a", "field_b", "field_c"],
2352          ["black", "黒", "0.983"],
2353          ["red", "赤", "23.8"]];
2354 
2355     string[][] data3x6ExpectedSampleAlgoRNum1 =
2356         [["field_a", "field_b", "field_c"],
2357          ["green", "緑", "0.0072"]];
2358 
2359     /* Inorder versions. */
2360     string[][] data3x6ExpectedSampleAlgoRNum6Inorder =
2361         [["field_a", "field_b", "field_c"],
2362          ["red", "赤", "23.8"],
2363          ["green", "緑", "0.0072"],
2364          ["white", "白", "1.65"],
2365          ["yellow", "黄", "12"],
2366          ["blue", "青", "12"],
2367          ["black", "黒", "0.983"]];
2368 
2369     string[][] data3x6ExpectedSampleAlgoRNum5Inorder =
2370         [["field_a", "field_b", "field_c"],
2371          ["red", "赤", "23.8"],
2372          ["green", "緑", "0.0072"],
2373          ["white", "白", "1.65"],
2374          ["yellow", "黄", "12"],
2375          ["black", "黒", "0.983"]];
2376 
2377     string[][] data3x6ExpectedSampleAlgoRNum4Inorder =
2378         [["field_a", "field_b", "field_c"],
2379          ["green", "緑", "0.0072"],
2380          ["white", "白", "1.65"],
2381          ["blue", "青", "12"],
2382          ["black", "黒", "0.983"]];
2383 
2384     string[][] data3x6ExpectedSampleAlgoRNum3Inorder =
2385         [["field_a", "field_b", "field_c"],
2386          ["red", "赤", "23.8"],
2387          ["green", "緑", "0.0072"],
2388          ["black", "黒", "0.983"]];
2389 
2390     string[][] data3x6ExpectedSampleAlgoRNum2Inorder =
2391         [["field_a", "field_b", "field_c"],
2392          ["red", "赤", "23.8"],
2393          ["black", "黒", "0.983"]];
2394 
2395     string[][] data3x6ExpectedSampleAlgoRNum1Inorder =
2396         [["field_a", "field_b", "field_c"],
2397          ["green", "緑", "0.0072"]];
2398 
2399     /* Reservoir inorder */
2400     string[][] data3x6ExpectedSampleCompatNum6Inorder =
2401         [["field_a", "field_b", "field_c"],
2402          ["red", "赤", "23.8"],
2403          ["green", "緑", "0.0072"],
2404          ["white", "白", "1.65"],
2405          ["yellow", "黄", "12"],
2406          ["blue", "青", "12"],
2407          ["black", "黒", "0.983"]];
2408 
2409     string[][] data3x6ExpectedSampleCompatNum5Inorder =
2410         [["field_a", "field_b", "field_c"],
2411          ["green", "緑", "0.0072"],
2412          ["white", "白", "1.65"],
2413          ["yellow", "黄", "12"],
2414          ["blue", "青", "12"],
2415          ["black", "黒", "0.983"]];
2416 
2417     string[][] data3x6ExpectedSampleCompatNum4Inorder =
2418         [["field_a", "field_b", "field_c"],
2419          ["white", "白", "1.65"],
2420          ["yellow", "黄", "12"],
2421          ["blue", "青", "12"],
2422          ["black", "黒", "0.983"]];
2423 
2424     string[][] data3x6ExpectedSampleCompatNum3Inorder =
2425         [["field_a", "field_b", "field_c"],
2426          ["yellow", "黄", "12"],
2427          ["blue", "青", "12"],
2428          ["black", "黒", "0.983"]];
2429 
2430     string[][] data3x6ExpectedSampleCompatNum2Inorder =
2431         [["field_a", "field_b", "field_c"],
2432          ["yellow", "黄", "12"],
2433          ["black", "黒", "0.983"]];
2434 
2435     string[][] data3x6ExpectedSampleCompatNum1Inorder =
2436         [["field_a", "field_b", "field_c"],
2437          ["yellow", "黄", "12"]];
2438 
2439 
2440     /* Reservoir inorder with probabilities. */
2441     string[][] data3x6ExpectedSampleCompatNum6ProbsInorder =
2442         [["random_value", "field_a", "field_b", "field_c"],
2443          ["0.010968807619065046", "red", "赤", "23.8"],
2444          ["0.15929344086907804", "green", "緑", "0.0072"],
2445          ["0.49287854949943721", "white", "白", "1.65"],
2446          ["0.96055546286515892", "yellow", "黄", "12"],
2447          ["0.52525980887003243", "blue", "青", "12"],
2448          ["0.75710153928957880", "black", "黒", "0.983"]];
2449 
2450     string[][] data3x6ExpectedSampleCompatNum5ProbsInorder =
2451         [["random_value", "field_a", "field_b", "field_c"],
2452          ["0.15929344086907804", "green", "緑", "0.0072"],
2453          ["0.49287854949943721", "white", "白", "1.65"],
2454          ["0.96055546286515892", "yellow", "黄", "12"],
2455          ["0.52525980887003243", "blue", "青", "12"],
2456          ["0.75710153928957880", "black", "黒", "0.983"]];
2457 
2458     string[][] data3x6ExpectedSampleCompatNum4ProbsInorder =
2459         [["random_value", "field_a", "field_b", "field_c"],
2460          ["0.49287854949943721", "white", "白", "1.65"],
2461          ["0.96055546286515892", "yellow", "黄", "12"],
2462          ["0.52525980887003243", "blue", "青", "12"],
2463          ["0.75710153928957880", "black", "黒", "0.983"]];
2464 
2465     string[][] data3x6ExpectedSampleCompatNum3ProbsInorder =
2466         [["random_value", "field_a", "field_b", "field_c"],
2467          ["0.96055546286515892", "yellow", "黄", "12"],
2468          ["0.52525980887003243", "blue", "青", "12"],
2469          ["0.75710153928957880", "black", "黒", "0.983"]];
2470 
2471     string[][] data3x6ExpectedSampleCompatNum2ProbsInorder =
2472         [["random_value", "field_a", "field_b", "field_c"],
2473          ["0.96055546286515892", "yellow", "黄", "12"],
2474          ["0.75710153928957880", "black", "黒", "0.983"]];
2475 
2476     string[][] data3x6ExpectedSampleCompatNum1ProbsInorder =
2477         [["random_value", "field_a", "field_b", "field_c"],
2478          ["0.96055546286515892", "yellow", "黄", "12"]];
2479 
2480     string[][] data3x6ExpectedWt3Num6Inorder =
2481         [["field_a", "field_b", "field_c"],
2482          ["red", "赤", "23.8"],
2483          ["green", "緑", "0.0072"],
2484          ["white", "白", "1.65"],
2485          ["yellow", "黄", "12"],
2486          ["blue", "青", "12"],
2487          ["black", "黒", "0.983"]];
2488 
2489     string[][] data3x6ExpectedWt3Num5Inorder =
2490         [["field_a", "field_b", "field_c"],
2491          ["green", "緑", "0.0072"],
2492          ["white", "白", "1.65"],
2493          ["yellow", "黄", "12"],
2494          ["blue", "青", "12"],
2495          ["black", "黒", "0.983"]];
2496 
2497     string[][] data3x6ExpectedWt3Num4Inorder =
2498         [["field_a", "field_b", "field_c"],
2499          ["white", "白", "1.65"],
2500          ["yellow", "黄", "12"],
2501          ["blue", "青", "12"],
2502          ["black", "黒", "0.983"]];
2503 
2504     string[][] data3x6ExpectedWt3Num3Inorder =
2505         [["field_a", "field_b", "field_c"],
2506          ["yellow", "黄", "12"],
2507          ["blue", "青", "12"],
2508          ["black", "黒", "0.983"]];
2509 
2510     string[][] data3x6ExpectedWt3Num2Inorder =
2511         [["field_a", "field_b", "field_c"],
2512          ["yellow", "黄", "12"],
2513          ["black", "黒", "0.983"]];
2514 
2515     string[][] data3x6ExpectedWt3Num1Inorder =
2516         [["field_a", "field_b", "field_c"],
2517          ["yellow", "黄", "12"]];
2518 
2519 
2520     string[][] data3x6ExpectedBernoulliProbsP100 =
2521         [["random_value", "field_a", "field_b", "field_c"],
2522          ["0.010968807619065046", "red", "赤", "23.8"],
2523          ["0.15929344086907804", "green", "緑", "0.0072"],
2524          ["0.49287854949943721", "white", "白", "1.65"],
2525          ["0.96055546286515892", "yellow", "黄", "12"],
2526          ["0.52525980887003243", "blue", "青", "12"],
2527          ["0.75710153928957880", "black", "黒", "0.983"]];
2528 
2529     string[][] data3x6ExpectedBernoulliCompatProbsP60 =
2530         [["random_value", "field_a", "field_b", "field_c"],
2531          ["0.010968807619065046", "red", "赤", "23.8"],
2532          ["0.15929344086907804", "green", "緑", "0.0072"],
2533          ["0.49287854949943721", "white", "白", "1.65"],
2534          ["0.52525980887003243", "blue", "青", "12"]];
2535 
2536     string[][] data3x6ExpectedBernoulliSkipP40 =
2537         [["field_a", "field_b", "field_c"],
2538          ["red", "赤", "23.8"],
2539          ["green", "緑", "0.0072"],
2540          ["yellow", "黄", "12"]];
2541 
2542     string[][] data3x6ExpectedBernoulliCompatP60 =
2543         [["field_a", "field_b", "field_c"],
2544          ["red", "赤", "23.8"],
2545          ["green", "緑", "0.0072"],
2546          ["white", "白", "1.65"],
2547          ["blue", "青", "12"]];
2548 
2549     string[][] data3x6ExpectedDistinctK1K3P60 =
2550         [["field_a", "field_b", "field_c"],
2551          ["green", "緑", "0.0072"],
2552          ["white", "白", "1.65"],
2553          ["blue", "青", "12"]];
2554 
2555     string[][] data3x6ExpectedDistinctK1K3P60Probs =
2556         [["random_value", "field_a", "field_b", "field_c"],
2557          ["0", "green", "緑", "0.0072"],
2558          ["0", "white", "白", "1.65"],
2559          ["0", "blue", "青", "12"]];
2560 
2561     string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom =
2562         [["custom_random_value_header", "field_a", "field_b", "field_c"],
2563          ["0", "green", "緑", "0.0072"],
2564          ["0", "white", "白", "1.65"],
2565          ["0", "blue", "青", "12"]];
2566 
2567     string[][] data3x6ExpectedDistinctK2P2ProbsInorder =
2568         [["random_value", "field_a", "field_b", "field_c"],
2569          ["1", "red", "赤", "23.8"],
2570          ["0", "green", "緑", "0.0072"],
2571          ["0", "white", "白", "1.65"],
2572          ["1", "yellow", "黄", "12"],
2573          ["3", "blue", "青", "12"],
2574          ["2", "black", "黒", "0.983"]];
2575 
2576     string[][] data3x6ExpectedPermuteWt3Probs =
2577         [["random_value", "field_a", "field_b", "field_c"],
2578          ["0.99665198757645390", "yellow", "黄", "12"],
2579          ["0.94775884809836686", "blue", "青", "12"],
2580          ["0.82728234682286661", "red", "赤", "23.8"],
2581          ["0.75346697377181959", "black", "黒", "0.983"],
2582          ["0.65130103496422487", "white", "白", "1.65"],
2583          ["1.5636943712879866e-111", "green", "緑", "0.0072"]];
2584 
2585     string[][] data3x6ExpectedWt3ProbsInorder =
2586         [["random_value", "field_a", "field_b", "field_c"],
2587          ["0.82728234682286661", "red", "赤", "23.8"],
2588          ["1.5636943712879866e-111", "green", "緑", "0.0072"],
2589          ["0.65130103496422487", "white", "白", "1.65"],
2590          ["0.99665198757645390", "yellow", "黄", "12"],
2591          ["0.94775884809836686", "blue", "青", "12"],
2592          ["0.75346697377181959", "black", "黒", "0.983"]];
2593 
2594     string[][] data3x6ExpectedPermuteWt3 =
2595         [["field_a", "field_b", "field_c"],
2596          ["yellow", "黄", "12"],
2597          ["blue", "青", "12"],
2598          ["red", "赤", "23.8"],
2599          ["black", "黒", "0.983"],
2600          ["white", "白", "1.65"],
2601          ["green", "緑", "0.0072"]];
2602 
2603 
2604     string[][] data3x6ExpectedReplaceNum10 =
2605         [["field_a", "field_b", "field_c"],
2606          ["black", "黒", "0.983"],
2607          ["green", "緑", "0.0072"],
2608          ["green", "緑", "0.0072"],
2609          ["red", "赤", "23.8"],
2610          ["yellow", "黄", "12"],
2611          ["red", "赤", "23.8"],
2612          ["white", "白", "1.65"],
2613          ["yellow", "黄", "12"],
2614          ["yellow", "黄", "12"],
2615          ["white", "白", "1.65"],
2616         ];
2617 
2618     string[][] data3x6ExpectedReplaceNum10V77 =
2619         [["field_a", "field_b", "field_c"],
2620          ["black", "黒", "0.983"],
2621          ["red", "赤", "23.8"],
2622          ["black", "黒", "0.983"],
2623          ["yellow", "黄", "12"],
2624          ["green", "緑", "0.0072"],
2625          ["green", "緑", "0.0072"],
2626          ["green", "緑", "0.0072"],
2627          ["yellow", "黄", "12"],
2628          ["blue", "青", "12"],
2629          ["white", "白", "1.65"],
2630         ];
2631 
2632     /* Using a different static seed. */
2633     string[][] data3x6ExpectedPermuteCompatV41Probs =
2634         [["random_value", "field_a", "field_b", "field_c"],
2635          ["0.68057272653095424", "green", "緑", "0.0072"],
2636          ["0.67681624367833138", "blue", "青", "12"],
2637          ["0.32097338931635022", "yellow", "黄", "12"],
2638          ["0.25092361867427826", "red", "赤", "23.8"],
2639          ["0.15535934292711318", "black", "黒", "0.983"],
2640          ["0.046095821075141430", "white", "白", "1.65"]];
2641 
2642     string[][] data3x6ExpectedBernoulliCompatP60V41Probs =
2643         [["random_value", "field_a", "field_b", "field_c"],
2644          ["0.25092361867427826", "red", "赤", "23.8"],
2645          ["0.046095821075141430", "white", "白", "1.65"],
2646          ["0.32097338931635022", "yellow", "黄", "12"],
2647          ["0.15535934292711318", "black", "黒", "0.983"]];
2648 
2649     string[][] data3x6ExpectedPermuteWt3V41Probs =
2650         [["random_value", "field_a", "field_b", "field_c"],
2651          ["0.96799377498910666", "blue", "青", "12"],
2652          ["0.94356245792573568", "red", "赤", "23.8"],
2653          ["0.90964601024271996", "yellow", "黄", "12"],
2654          ["0.15491658409260103", "white", "白", "1.65"],
2655          ["0.15043620392537033", "black", "黒", "0.983"],
2656          ["6.1394674830701461e-24", "green", "緑", "0.0072"]];
2657 
2658     string[][] data3x6ExpectedWt3V41ProbsInorder =
2659         [["random_value", "field_a", "field_b", "field_c"],
2660          ["0.94356245792573568", "red", "赤", "23.8"],
2661          ["6.1394674830701461e-24", "green", "緑", "0.0072"],
2662          ["0.15491658409260103", "white", "白", "1.65"],
2663          ["0.90964601024271996", "yellow", "黄", "12"],
2664          ["0.96799377498910666", "blue", "青", "12"],
2665          ["0.15043620392537033", "black", "黒", "0.983"]];
2666 
2667 
2668     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2669     string[][] combo1ExpectedPermuteCompat =
2670         [["field_a", "field_b", "field_c"],
2671          ["yellow", "黄", "12"],
2672          ["tan", "タン", "8.5"],
2673          ["brown", "褐色", "29.2"],
2674          ["green", "緑", "0.0072"],
2675          ["red", "赤", "23.8"],
2676          ["purple", "紫の", "42"],
2677          ["black", "黒", "0.983"],
2678          ["white", "白", "1.65"],
2679          ["gray", "グレー", "6.2"],
2680          ["blue", "青", "12"],
2681          ["pink", "ピンク", "1.1"],
2682          ["orange", "オレンジ", "2.5"]];
2683 
2684     string[][] combo1ExpectedPermuteCompatProbs =
2685         [["random_value", "field_a", "field_b", "field_c"],
2686          ["0.97088520275428891", "yellow", "黄", "12"],
2687          ["0.96055546286515892", "tan", "タン", "8.5"],
2688          ["0.81756894313730299", "brown", "褐色", "29.2"],
2689          ["0.75710153928957880", "green", "緑", "0.0072"],
2690          ["0.52525980887003243", "red", "赤", "23.8"],
2691          ["0.49287854949943721", "purple", "紫の", "42"],
2692          ["0.47081507067196071", "black", "黒", "0.983"],
2693          ["0.38388182921335101", "white", "白", "1.65"],
2694          ["0.29215990612283349", "gray", "グレー", "6.2"],
2695          ["0.24033216014504433", "blue", "青", "12"],
2696          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2697          ["0.010968807619065046", "orange", "オレンジ", "2.5"]];
2698 
2699     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2700     string[][] combo1ExpectedProbsInorder =
2701         [["random_value", "field_a", "field_b", "field_c"],
2702          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2703          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2704          ["0.49287854949943721", "purple", "紫の", "42"],
2705          ["0.96055546286515892", "tan", "タン", "8.5"],
2706          ["0.52525980887003243", "red", "赤", "23.8"],
2707          ["0.75710153928957880", "green", "緑", "0.0072"],
2708          ["0.38388182921335101", "white", "白", "1.65"],
2709          ["0.97088520275428891", "yellow", "黄", "12"],
2710          ["0.24033216014504433", "blue", "青", "12"],
2711          ["0.47081507067196071", "black", "黒", "0.983"],
2712          ["0.81756894313730299", "brown", "褐色", "29.2"],
2713          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2714 
2715     string[][] combo1ExpectedBernoulliCompatP50Probs =
2716         [["random_value", "field_a", "field_b", "field_c"],
2717          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2718          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2719          ["0.49287854949943721", "purple", "紫の", "42"],
2720          ["0.38388182921335101", "white", "白", "1.65"],
2721          ["0.24033216014504433", "blue", "青", "12"],
2722          ["0.47081507067196071", "black", "黒", "0.983"],
2723          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2724 
2725     string[][] combo1ExpectedBernoulliCompatP40 =
2726         [["field_a", "field_b", "field_c"],
2727          ["orange", "オレンジ", "2.5"],
2728          ["pink", "ピンク", "1.1"],
2729          ["white", "白", "1.65"],
2730          ["blue", "青", "12"],
2731          ["gray", "グレー", "6.2"]];
2732 
2733     string[][] combo1ExpectedDistinctK1P40 =
2734         [["field_a", "field_b", "field_c"],
2735          ["orange", "オレンジ", "2.5"],
2736          ["red", "赤", "23.8"],
2737          ["green", "緑", "0.0072"],
2738          ["blue", "青", "12"],
2739          ["black", "黒", "0.983"]];
2740 
2741     string[][] combo1ExpectedPermuteWt3Probs =
2742         [["random_value", "field_a", "field_b", "field_c"],
2743          ["0.99754077523718754", "yellow", "黄", "12"],
2744          ["0.99527665440088786", "tan", "タン", "8.5"],
2745          ["0.99312578945741659", "brown", "褐色", "29.2"],
2746          ["0.98329602553389361", "purple", "紫の", "42"],
2747          ["0.97330961938083660", "red", "赤", "23.8"],
2748          ["0.88797551521739648", "blue", "青", "12"],
2749          ["0.81999230489041786", "gray", "グレー", "6.2"],
2750          ["0.55975569204250941", "white", "白", "1.65"],
2751          ["0.46472135609205739", "black", "黒", "0.983"],
2752          ["0.18824582704191337", "pink", "ピンク", "1.1"],
2753          ["0.16446131853299920", "orange", "オレンジ", "2.5"],
2754          ["1.6438086931020549e-17", "green", "緑", "0.0072"]];
2755 
2756     string[][] combo1ExpectedPermuteWt3 =
2757         [["field_a", "field_b", "field_c"],
2758          ["yellow", "黄", "12"],
2759          ["tan", "タン", "8.5"],
2760          ["brown", "褐色", "29.2"],
2761          ["purple", "紫の", "42"],
2762          ["red", "赤", "23.8"],
2763          ["blue", "青", "12"],
2764          ["gray", "グレー", "6.2"],
2765          ["white", "白", "1.65"],
2766          ["black", "黒", "0.983"],
2767          ["pink", "ピンク", "1.1"],
2768          ["orange", "オレンジ", "2.5"],
2769          ["green", "緑", "0.0072"]];
2770 
2771         string[][] combo1ExpectedSampleAlgoRNum4 =
2772         [["field_a", "field_b", "field_c"],
2773          ["blue", "青", "12"],
2774          ["gray", "グレー", "6.2"],
2775          ["brown", "褐色", "29.2"],
2776          ["white", "白", "1.65"]];
2777 
2778         string[][] combo1ExpectedSampleAlgoRNum4Inorder =
2779         [["field_a", "field_b", "field_c"],
2780          ["white", "白", "1.65"],
2781          ["blue", "青", "12"],
2782          ["brown", "褐色", "29.2"],
2783          ["gray", "グレー", "6.2"]];
2784 
2785     string[][] combo1ExpectedReplaceNum10 =
2786         [["field_a", "field_b", "field_c"],
2787          ["gray", "グレー", "6.2"],
2788          ["yellow", "黄", "12"],
2789          ["yellow", "黄", "12"],
2790          ["white", "白", "1.65"],
2791          ["tan", "タン", "8.5"],
2792          ["white", "白", "1.65"],
2793          ["blue", "青", "12"],
2794          ["black", "黒", "0.983"],
2795          ["tan", "タン", "8.5"],
2796          ["purple", "紫の", "42"]];
2797 
2798     /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. */
2799     string[][] data1x200 =
2800         [["field_a"],
2801          ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"],
2802          ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"],
2803          ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"],
2804          ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"],
2805          ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"],
2806          ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"],
2807          ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"],
2808          ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"],
2809          ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"],
2810          ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"],
2811          ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"],
2812          ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"],
2813          ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"],
2814          ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"],
2815          ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"],
2816          ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"],
2817          ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"],
2818          ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"],
2819          ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"],
2820          ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"],
2821         ];
2822 
2823     string fpath_data1x200 = buildPath(testDir, "data1x200.tsv");
2824     string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv");
2825     writeUnittestTsvFile(fpath_data1x200, data1x200);
2826     writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1 .. $]);
2827 
2828     string[][] data1x200ExpectedBernoulliSkipV333P01 =
2829         [["field_a"],
2830          ["077"],
2831          ["119"]];
2832 
2833     string[][] data1x200ExpectedBernoulliSkipV333P02 =
2834         [["field_a"],
2835          ["038"],
2836          ["059"],
2837          ["124"],
2838          ["161"],
2839          ["162"],
2840          ["183"]];
2841 
2842     string[][] data1x200ExpectedBernoulliSkipV333P03 =
2843         [["field_a"],
2844          ["025"],
2845          ["039"],
2846          ["082"],
2847          ["107"],
2848          ["108"],
2849          ["122"],
2850          ["136"],
2851          ["166"],
2852          ["182"]];
2853 
2854     string[][] data1x200ExpectedBernoulliCompatV333P01 =
2855         [["field_a"],
2856          ["072"]];
2857 
2858     string[][] data1x200ExpectedBernoulliCompatV333P02 =
2859         [["field_a"],
2860          ["004"],
2861          ["072"]];
2862 
2863     string[][] data1x200ExpectedBernoulliCompatV333P03 =
2864         [["field_a"],
2865          ["004"],
2866          ["072"],
2867          ["181"]];
2868 
2869     /* Combo 2, for Bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files,
2870      * only expected results. The header is from 3x0. The results are offset by one position
2871      * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line.
2872      */
2873     string[][] combo2ExpectedBernoulliSkipV333P03 =
2874         [["field_a", "field_b", "field_c"],
2875          ["024"],
2876          ["038"],
2877          ["081"],
2878          ["106"],
2879          ["107"],
2880          ["121"],
2881          ["135"],
2882          ["165"],
2883          ["181"]];
2884 
2885 
2886     /* 1x10 - Simple 1-column file. */
2887     string[][] data1x10 =
2888         [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
2889     string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
2890     string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
2891     writeUnittestTsvFile(fpath_data1x10, data1x10);
2892     writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1 .. $]);
2893 
2894     string[][] data1x10ExpectedPermuteCompat =
2895         [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
2896 
2897     string[][] data1x10ExpectedPermuteWt1 =
2898         [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]];
2899 
2900     /* 2x10a - Uniform distribution [0,1]. */
2901     string[][] data2x10a =
2902         [["line", "weight"],
2903          ["1", "0.26788837"],
2904          ["2", "0.06601298"],
2905          ["3", "0.38627527"],
2906          ["4", "0.47379424"],
2907          ["5", "0.02966641"],
2908          ["6", "0.05636231"],
2909          ["7", "0.70529242"],
2910          ["8", "0.91836862"],
2911          ["9", "0.99103720"],
2912          ["10", "0.31401740"]];
2913 
2914     string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv");
2915     writeUnittestTsvFile(fpath_data2x10a, data2x10a);
2916 
2917     string[][] data2x10aExpectedPermuteWt2Probs =
2918         [["random_value", "line", "weight"],
2919          ["0.96833865494543658", "8", "0.91836862"],
2920          ["0.91856842054413923", "4", "0.47379424"],
2921          ["0.25730832087795091", "7", "0.70529242"],
2922          ["0.23725317907018120", "9", "0.99103720"],
2923          ["0.16016096701872204", "3", "0.38627527"],
2924          ["0.090819662667243381", "10", "0.31401740"],
2925          ["0.0071764539244361172", "6", "0.05636231"],
2926          ["0.000000048318642951630057", "1", "0.26788837"],
2927          ["0.00000000037525692966535517", "5", "0.02966641"],
2928          ["8.2123247880095796e-13", "2", "0.06601298"]];
2929 
2930     /* 2x10b - Uniform distribution [0,1000]. */
2931     string[][] data2x10b =
2932         [["line", "weight"],
2933          ["1", "761"],
2934          ["2", "432"],
2935          ["3", "103"],
2936          ["4", "448"],
2937          ["5", "750"],
2938          ["6", "711"],
2939          ["7", "867"],
2940          ["8", "841"],
2941          ["9", "963"],
2942          ["10", "784"]];
2943 
2944     string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv");
2945     writeUnittestTsvFile(fpath_data2x10b, data2x10b);
2946 
2947     string[][] data2x10bExpectedPermuteWt2Probs =
2948         [["random_value", "line", "weight"],
2949          ["0.99996486739067969", "8", "841"],
2950          ["0.99991017467137211", "4", "448"],
2951          ["0.99960871524873662", "6", "711"],
2952          ["0.99914188537143800", "5", "750"],
2953          ["0.99903963250274785", "10", "784"],
2954          ["0.99889631825931946", "7", "867"],
2955          ["0.99852058315191139", "9", "963"],
2956          ["0.99575669679158918", "2", "432"],
2957          ["0.99408758732050595", "1", "761"],
2958          ["0.99315467761212362", "3", "103"]];
2959 
2960     /* 2x10c - Logarithmic distribution in random order. */
2961     string[][] data2x10c =
2962         [["line", "weight"],
2963          ["1", "31.85"],
2964          ["2", "17403.31"],
2965          ["3", "653.84"],
2966          ["4", "8.23"],
2967          ["5", "2671.04"],
2968          ["6", "26226.08"],
2969          ["7", "1.79"],
2970          ["8", "354.56"],
2971          ["9", "35213.81"],
2972          ["10", "679.29"]];
2973 
2974     string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv");
2975     writeUnittestTsvFile(fpath_data2x10c, data2x10c);
2976 
2977     string[][] data2x10cExpectedPermuteWt2Probs =
2978         [["random_value", "line", "weight"],
2979          ["0.99998939008709697", "6", "26226.08"],
2980          ["0.99995951291695517", "9", "35213.81"],
2981          ["0.99991666907613541", "8", "354.56"],
2982          ["0.99989445052186410", "2", "17403.31"],
2983          ["0.99975897602861630", "5", "2671.04"],
2984          ["0.99891852769877643", "3", "653.84"],
2985          ["0.99889167752782515", "10", "679.29"],
2986          ["0.99512207506850148", "4", "8.23"],
2987          ["0.86789371584259023", "1", "31.85"],
2988          ["0.58574438162915610", "7", "1.79"]];
2989 
2990     /* 2x10d. Logarithmic distribution in ascending order. */
2991     string[][] data2x10d =
2992         [["line", "weight"],
2993          ["1", "1.79"],
2994          ["2", "8.23"],
2995          ["3", "31.85"],
2996          ["4", "354.56"],
2997          ["5", "653.84"],
2998          ["6", "679.29"],
2999          ["7", "2671.04"],
3000          ["8", "17403.31"],
3001          ["9", "26226.08"],
3002          ["10", "35213.81"]];
3003 
3004     string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv");
3005     writeUnittestTsvFile(fpath_data2x10d, data2x10d);
3006 
3007     string[][] data2x10dExpectedPermuteWt2Probs =
3008         [["random_value", "line", "weight"],
3009          ["0.99999830221846353", "8", "17403.31"],
3010          ["0.99997860834041397", "10", "35213.81"],
3011          ["0.99994563828986716", "9", "26226.08"],
3012          ["0.99988650363575737", "4", "354.56"],
3013          ["0.99964161939190088", "7", "2671.04"],
3014          ["0.99959045338948649", "6", "679.29"],
3015          ["0.99901574490639788", "5", "653.84"],
3016          ["0.97803163304747431", "3", "31.85"],
3017          ["0.79994791806910948", "2", "8.23"],
3018          ["0.080374261239949119", "1", "1.79"]];
3019 
3020     /* 2x10e. Logarithmic distribution in descending order. */
3021     string[][] data2x10e =
3022         [["line", "weight"],
3023          ["1", "35213.81"],
3024          ["2", "26226.08"],
3025          ["3", "17403.31"],
3026          ["4", "2671.04"],
3027          ["5", "679.29"],
3028          ["6", "653.84"],
3029          ["7", "354.56"],
3030          ["8", "31.85"],
3031          ["9", "8.23"],
3032          ["10", "1.79"]];
3033     string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
3034     writeUnittestTsvFile(fpath_data2x10e, data2x10e);
3035 
3036     string[][] data2x10eExpectedPermuteWt2Probs =
3037         [["random_value", "line", "weight"],
3038          ["0.99998493348975237", "4", "2671.04"],
3039          ["0.99995934807202624", "3", "17403.31"],
3040          ["0.99992995739727453", "2", "26226.08"],
3041          ["0.99987185679245649", "1", "35213.81"],
3042          ["0.99957451563173938", "6", "653.84"],
3043          ["0.99907273650209583", "8", "31.85"],
3044          ["0.99905260312968946", "5", "679.29"],
3045          ["0.99730333650516401", "7", "354.56"],
3046          ["0.84093902435227808", "9", "8.23"],
3047          ["0.65650015926290028", "10", "1.79"]];
3048 
3049     /* Data sets for distinct sampling. */
3050     string[][] data5x25 =
3051         [["ID", "Shape", "Color", "Size", "Weight"],
3052          ["01", "circle", "red", "S", "10"],
3053          ["02", "circle", "black", "L", "20"],
3054          ["03", "square", "black", "L", "20"],
3055          ["04", "circle", "green", "L", "30"],
3056          ["05", "ellipse", "red", "S", "20"],
3057          ["06", "triangle", "red", "S", "10"],
3058          ["07", "triangle", "red", "L", "20"],
3059          ["08", "square", "black", "S", "10"],
3060          ["09", "circle", "black", "S", "20"],
3061          ["10", "square", "green", "L", "20"],
3062          ["11", "triangle", "red", "L", "20"],
3063          ["12", "circle", "green", "L", "30"],
3064          ["13", "ellipse", "red", "S", "20"],
3065          ["14", "circle", "green", "L", "30"],
3066          ["15", "ellipse", "red", "L", "30"],
3067          ["16", "square", "red", "S", "10"],
3068          ["17", "circle", "black", "L", "20"],
3069          ["18", "square", "red", "S", "20"],
3070          ["19", "square", "black", "L", "20"],
3071          ["20", "circle", "red", "S", "10"],
3072          ["21", "ellipse", "black", "L", "30"],
3073          ["22", "triangle", "red", "L", "30"],
3074          ["23", "circle", "green", "S", "20"],
3075          ["24", "square", "green", "L", "20"],
3076          ["25", "circle", "red", "S", "10"],
3077         ];
3078 
3079     string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
3080     string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
3081     writeUnittestTsvFile(fpath_data5x25, data5x25);
3082     writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1 .. $]);
3083 
3084     string[][] data5x25ExpectedDistinctK2P40 =
3085         [["ID", "Shape", "Color", "Size", "Weight"],
3086          ["03", "square", "black", "L", "20"],
3087          ["05", "ellipse", "red", "S", "20"],
3088          ["08", "square", "black", "S", "10"],
3089          ["10", "square", "green", "L", "20"],
3090          ["13", "ellipse", "red", "S", "20"],
3091          ["15", "ellipse", "red", "L", "30"],
3092          ["16", "square", "red", "S", "10"],
3093          ["18", "square", "red", "S", "20"],
3094          ["19", "square", "black", "L", "20"],
3095          ["21", "ellipse", "black", "L", "30"],
3096          ["24", "square", "green", "L", "20"],
3097         ];
3098 
3099     string[][] data5x25ExpectedDistinctK2K4P20 =
3100         [["ID", "Shape", "Color", "Size", "Weight"],
3101          ["03", "square", "black", "L", "20"],
3102          ["07", "triangle", "red", "L", "20"],
3103          ["08", "square", "black", "S", "10"],
3104          ["10", "square", "green", "L", "20"],
3105          ["11", "triangle", "red", "L", "20"],
3106          ["16", "square", "red", "S", "10"],
3107          ["18", "square", "red", "S", "20"],
3108          ["19", "square", "black", "L", "20"],
3109          ["22", "triangle", "red", "L", "30"],
3110          ["24", "square", "green", "L", "20"],
3111         ];
3112 
3113     string[][] data5x25ExpectedDistinctK2K3K4P20 =
3114         [["ID", "Shape", "Color", "Size", "Weight"],
3115          ["04", "circle", "green", "L", "30"],
3116          ["07", "triangle", "red", "L", "20"],
3117          ["09", "circle", "black", "S", "20"],
3118          ["11", "triangle", "red", "L", "20"],
3119          ["12", "circle", "green", "L", "30"],
3120          ["14", "circle", "green", "L", "30"],
3121          ["16", "square", "red", "S", "10"],
3122          ["18", "square", "red", "S", "20"],
3123          ["22", "triangle", "red", "L", "30"],
3124         ];
3125 
3126     /* Fields 2 and 4 from data5x25. Distinct sampling should select the same rows when equivalent keys are used. */
3127     string[][] data2x25 =
3128         [["Shape", "Size"],
3129          ["circle", "S"],
3130          ["circle", "L"],
3131          ["square", "L"],
3132          ["circle", "L"],
3133          ["ellipse", "S"],
3134          ["triangle", "S"],
3135          ["triangle", "L"],
3136          ["square", "S"],
3137          ["circle", "S"],
3138          ["square", "L"],
3139          ["triangle", "L"],
3140          ["circle", "L"],
3141          ["ellipse", "S"],
3142          ["circle", "L"],
3143          ["ellipse", "L"],
3144          ["square", "S"],
3145          ["circle", "L"],
3146          ["square", "S"],
3147          ["square", "L"],
3148          ["circle", "S"],
3149          ["ellipse", "L"],
3150          ["triangle", "L"],
3151          ["circle", "S"],
3152          ["square", "L"],
3153          ["circle", "S"],
3154         ];
3155 
3156     string fpath_data2x25 = buildPath(testDir, "data2x25.tsv");
3157     string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv");
3158     writeUnittestTsvFile(fpath_data2x25, data2x25);
3159     writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1 .. $]);
3160 
3161     string[][] data2x25ExpectedDistinctK1K2P20 =
3162         [["Shape", "Size"],
3163          ["square", "L"],
3164          ["triangle", "L"],
3165          ["square", "S"],
3166          ["square", "L"],
3167          ["triangle", "L"],
3168          ["square", "S"],
3169          ["square", "S"],
3170          ["square", "L"],
3171          ["triangle", "L"],
3172          ["square", "L"],
3173         ];
3174 
3175     string[][] data1x25 =
3176         [["Shape-Size"],
3177          ["circle-S"],
3178          ["circle-L"],
3179          ["square-L"],
3180          ["circle-L"],
3181          ["ellipse-S"],
3182          ["triangle-S"],
3183          ["triangle-L"],
3184          ["square-S"],
3185          ["circle-S"],
3186          ["square-L"],
3187          ["triangle-L"],
3188          ["circle-L"],
3189          ["ellipse-S"],
3190          ["circle-L"],
3191          ["ellipse-L"],
3192          ["square-S"],
3193          ["circle-L"],
3194          ["square-S"],
3195          ["square-L"],
3196          ["circle-S"],
3197          ["ellipse-L"],
3198          ["triangle-L"],
3199          ["circle-S"],
3200          ["square-L"],
3201          ["circle-S"],
3202         ];
3203 
3204     string fpath_data1x25 = buildPath(testDir, "data1x25.tsv");
3205     string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv");
3206     writeUnittestTsvFile(fpath_data1x25, data1x25);
3207     writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1 .. $]);
3208 
3209     string[][] data1x25ExpectedDistinctK1P20 =
3210         [["Shape-Size"],
3211          ["triangle-L"],
3212          ["square-S"],
3213          ["triangle-L"],
3214          ["ellipse-L"],
3215          ["square-S"],
3216          ["square-S"],
3217          ["ellipse-L"],
3218          ["triangle-L"],
3219         ];
3220 
3221     string[][] data1x25ExpectedDistinctK1P20Probs =
3222         [["random_value", "Shape-Size"],
3223          ["0", "triangle-L"],
3224          ["0", "square-S"],
3225          ["0", "triangle-L"],
3226          ["0", "ellipse-L"],
3227          ["0", "square-S"],
3228          ["0", "square-S"],
3229          ["0", "ellipse-L"],
3230          ["0", "triangle-L"],
3231         ];
3232 
3233     string[][] data1x25ExpectedDistinctK1P20ProbsInorder =
3234         [["random_value", "Shape-Size"],
3235          ["1", "circle-S"],
3236          ["4", "circle-L"],
3237          ["2", "square-L"],
3238          ["4", "circle-L"],
3239          ["2", "ellipse-S"],
3240          ["1", "triangle-S"],
3241          ["0", "triangle-L"],
3242          ["0", "square-S"],
3243          ["1", "circle-S"],
3244          ["2", "square-L"],
3245          ["0", "triangle-L"],
3246          ["4", "circle-L"],
3247          ["2", "ellipse-S"],
3248          ["4", "circle-L"],
3249          ["0", "ellipse-L"],
3250          ["0", "square-S"],
3251          ["4", "circle-L"],
3252          ["0", "square-S"],
3253          ["2", "square-L"],
3254          ["1", "circle-S"],
3255          ["0", "ellipse-L"],
3256          ["0", "triangle-L"],
3257          ["1", "circle-S"],
3258          ["2", "square-L"],
3259          ["1", "circle-S"],
3260         ];
3261 
3262     /*
3263      * Enough setup! Actually run some tests!
3264      */
3265 
3266     /* Shuffling tests. Headers, static seed, compatibility mode. With weights and without. */
3267     testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty);
3268     testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0);
3269     testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1);
3270     testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat);
3271     testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat);
3272     testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat);
3273     testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
3274     testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
3275     testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3276     testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3277     testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3278     testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
3279     testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
3280 
3281     /* Shuffling, without compatibility mode, or with both compatibility mode and random value printing. */
3282     testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
3283     testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0);
3284     testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1);
3285     testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle);
3286     testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap);
3287     testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap);
3288     testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
3289     testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3290     testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3291 
3292     /* Reservoir sampling using Algorithm R.
3293      * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.)
3294      */
3295     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
3296     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
3297     testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0);
3298     testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0);
3299     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1);
3300     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1);
3301     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6);
3302     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6);
3303     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5);
3304     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4);
3305     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3);
3306     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2);
3307     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1);
3308 
3309     /* Inorder versions of Algorithm R tests. */
3310     testTsvSample(["test-ai10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
3311     testTsvSample(["test-ai11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
3312     testTsvSample(["test-ai12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
3313     testTsvSample(["test-ai13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0);
3314     testTsvSample(["test-ai14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1);
3315     testTsvSample(["test-ai15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1);
3316     testTsvSample(["test-ai16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder);
3317     testTsvSample(["test-ai17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder);
3318     testTsvSample(["test-ai18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5Inorder);
3319     testTsvSample(["test-ai19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4Inorder);
3320     testTsvSample(["test-ai20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3Inorder);
3321     testTsvSample(["test-ai21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2Inorder);
3322     testTsvSample(["test-ai22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1Inorder);
3323 
3324     /* Bernoulli sampling cases. */
3325     testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty);
3326     testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0);
3327     testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1);
3328     testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6);
3329     testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6);
3330     testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3331     testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60);
3332     testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60);
3333     testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs);
3334 
3335     /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. */
3336     testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01);
3337     testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02);
3338     testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03);
3339     testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01);
3340     testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02);
3341     testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03);
3342     testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40);
3343 
3344     /* Distinct sampling cases. */
3345     testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
3346     testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
3347     testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1);
3348     testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6);
3349     testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
3350 
3351 
3352     /* Generating random weights. Use the Bernoulli sampling test set at prob 100% for uniform sampling.
3353      * For weighted sampling, use the weighted cases, but with expected results in the original input order.
3354      */
3355     testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3356     testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3357     testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
3358                   data3x6ExpectedWt3ProbsInorder);
3359     testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
3360                   data3x6ExpectedWt3V41ProbsInorder);
3361     testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6],
3362                   data3x6ExpectedDistinctK1K3P60Probs);
3363     testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header",
3364                    "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom);
3365     testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
3366                   data3x6ExpectedDistinctK2P2ProbsInorder);
3367 
3368     /* Simple random sampling with replacement. */
3369     testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty);
3370     testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty);
3371     testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0);
3372     testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0);
3373     testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3);
3374     testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10);
3375     testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77);
3376 
3377     /* Shuffling, compatibility mode, without headers. */
3378     testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1 .. $]);
3379     testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1 .. $]);
3380     testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1 .. $]);
3381     testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1 .. $]);
3382     testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1 .. $]);
3383     testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]);
3384     testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]);
3385     testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]);
3386     testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. $]);
3387 
3388     /* Shuffling, no headers, without compatibility mode, or with printing and compatibility mode. */
3389     testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1 .. $]);
3390     testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1 .. $]);
3391     testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1 .. $]);
3392     testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1 .. $]);
3393     testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]);
3394     testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]);
3395     testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]);
3396 
3397     /* Reservoir sampling using Algorithm R, no headers. */
3398     testTsvSample(["test-ac10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
3399     testTsvSample(["test-ac11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
3400     testTsvSample(["test-ac14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1 .. $]);
3401     testTsvSample(["test-ac15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1 .. $]);
3402     testTsvSample(["test-ac16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]);
3403     testTsvSample(["test-ac17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]);
3404     testTsvSample(["test-ac18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5[1 .. $]);
3405     testTsvSample(["test-ac19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4[1 .. $]);
3406     testTsvSample(["test-ac20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3[1 .. $]);
3407     testTsvSample(["test-ac21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2[1 .. $]);
3408     testTsvSample(["test-ac22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]);
3409 
3410     /* Reservoir sampling using Algorithm R, no headers, inorder output. */
3411     testTsvSample(["test-aj10", "--prefer-algorithm-r", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
3412     testTsvSample(["test-aj11", "--prefer-algorithm-r", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
3413     testTsvSample(["test-aj14", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3414     testTsvSample(["test-aj15", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3415     testTsvSample(["test-aj16", "--prefer-algorithm-r", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]);
3416     testTsvSample(["test-aj17", "--prefer-algorithm-r", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]);
3417     testTsvSample(["test-aj18", "--prefer-algorithm-r", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5Inorder[1 .. $]);
3418     testTsvSample(["test-aj19", "--prefer-algorithm-r", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4Inorder[1 .. $]);
3419     testTsvSample(["test-aj20", "--prefer-algorithm-r", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3Inorder[1 .. $]);
3420     testTsvSample(["test-aj21", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2Inorder[1 .. $]);
3421     testTsvSample(["test-aj22", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]);
3422 
3423     /* Bernoulli sampling cases. */
3424     testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1 .. $]);
3425     testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]);
3426     testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]);
3427     testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
3428     testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1 .. $]);
3429     testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. $]);
3430 
3431     /* Bernoulli sampling with probabilities in skip sampling range. Labels are "test-bs*"; "test-bb*" is used by the shuffling tests. */
3432     testTsvSample(["test-bs1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1 .. $]);
3433     testTsvSample(["test-bs2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1 .. $]);
3434     testTsvSample(["test-bs3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1 .. $]);
3435     testTsvSample(["test-bs4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1 .. $]);
3436     testTsvSample(["test-bs5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1 .. $]);
3437     testTsvSample(["test-bs6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1 .. $]);
3438     testTsvSample(["test-bs7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. $]);
3439 
3440     /* Distinct sampling cases. */
3441     testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1 .. $]);
3442     testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3443     testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3444     testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3445 
3446     /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. Each case has a unique label. */
3447     testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
3448     testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1 .. $]);
3449     testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
3450                   data3x6ExpectedDistinctK1K3P60Probs[1 .. $]);
3451     testTsvSample(["test-b24b", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
3452                   data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]);
3453 
3454     /* Simple random sampling with replacement. */
3455     testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
3456     testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
3457     testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1 .. $]);
3458     testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1 .. $]);
3459     testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]);
3460 
3461     /* Multi-file tests. */
3462     testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode",
3463                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3464                   combo1ExpectedPermuteCompat);
3465     testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
3466                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3467                   combo1ExpectedPermuteCompatProbs);
3468     testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
3469                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3470                   combo1ExpectedPermuteWt3Probs);
3471     testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode",
3472                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3473                   combo1ExpectedPermuteWt3);
3474     testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4",
3475                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3476                   combo1ExpectedSampleAlgoRNum4);
3477     testTsvSample(["test-c5b", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder",
3478                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3479                   combo1ExpectedSampleAlgoRNum4Inorder);
3480 
3481     /* Multi-file, no headers. */
3482     testTsvSample(["test-c6", "--static-seed", "--compatibility-mode",
3483                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3484                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3485                   combo1ExpectedPermuteCompat[1 .. $]);
3486     testTsvSample(["test-c7", "--static-seed", "--print-random",
3487                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3488                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3489                   combo1ExpectedPermuteCompatProbs[1 .. $]);
3490     testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3",
3491                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3492                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3493                   combo1ExpectedPermuteWt3Probs[1 .. $]);
3494     testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode",
3495                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3496                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3497                   combo1ExpectedPermuteWt3[1 .. $]);
3498     testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4",
3499                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3500                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3501                   combo1ExpectedSampleAlgoRNum4[1 .. $]);
3502     testTsvSample(["test-c10b", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder",
3503                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3504                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3505                   combo1ExpectedSampleAlgoRNum4Inorder[1 .. $]);
3506 
3507     /* Bernoulli sampling cases. */
3508     testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5",
3509                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3510                   combo1ExpectedBernoulliCompatP50Probs);
3511     testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4",
3512                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3513                   combo1ExpectedBernoulliCompatP40);
3514     testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5",
3515                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3516                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3517                   combo1ExpectedBernoulliCompatP50Probs[1 .. $]);
3518     testTsvSample(["test-c14", "--static-seed", "--prob", ".4",
3519                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3520                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3521                   combo1ExpectedBernoulliCompatP40[1 .. $]);
3522 
3523     /* Bernoulli sampling with probabilities in skip sampling range: cc1 with headers, cc2 without. */
3524     testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03",
3525                    fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10],
3526                   combo2ExpectedBernoulliSkipV333P03);
3527     testTsvSample(["test-cc2", "-v", "333", "-p", "0.03",
3528                    fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader],
3529                   combo2ExpectedBernoulliSkipV333P03[1 .. $]);
3530 
3531     /* Distinct sampling cases. The 'b' suffix keeps labels distinct from the Bernoulli test-c13/test-c14 cases. */
3532     testTsvSample(["test-c13b", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
3533                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3534                   combo1ExpectedDistinctK1P40);
3535     testTsvSample(["test-c14b", "--static-seed", "--key-fields", "1", "--prob", ".4",
3536                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3537                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3538                   combo1ExpectedDistinctK1P40[1 .. $]);
3539 
3540     /* Generating random weights. */
3541     testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder",
3542                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3543                   combo1ExpectedProbsInorder);
3544     testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder",
3545                    fpath_data3x3_noheader, fpath_data3x1_noheader,
3546                    fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
3547                   combo1ExpectedProbsInorder[1 .. $]);
3548 
3549     /* Simple random sampling with replacement. */
3550     testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10",
3551                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3552                   combo1ExpectedReplaceNum10);
3553 
3554     testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10",
3555                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3556                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3557                   combo1ExpectedReplaceNum10[1 .. $]);
3558 
3559     /* Single column file: d1 with a header line, d2 the same data without a header. */
3560     testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
3561     testTsvSample(["test-d2", "-s", "--compatibility-mode", fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1 .. $]);
3562 
3563     /* Distributions. */
3564     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
3565     testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs);
3566     testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs);
3567     testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs);
3568     testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs);
3569 
3570     /* Tests of the subset size option (-n|--num). Random sampling, Bernoulli sampling, distinct sampling.
3571      *
3572      * Note: The way these tests are done ensures that subset length does not affect
3573      * output order.
3574      */
3575     import std.algorithm : min;
3576     for (size_t n = data3x6.length + 2; n >= 1; n--)
3577     {
3578         /* reservoirSamplingViaHeap.
3579          */
3580         size_t expectedLength = min(data3x6.length, n + 1);
3581         testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
3582                        "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
3583 
3584         testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
3585                        "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
3586 
3587         testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
3588                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]);
3589 
3590         testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
3591                        "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]);
3592 
3593         testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
3594                        "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]);
3595 
3596         testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
3597                        fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]);
3598 
3599         testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
3600                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]);
3601 
3602         testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
3603                        "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]);
3604 
3605         testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string,
3606                        "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]);
3607 
3608         /* Bernoulli sampling.
3609          */
3610         import std.algorithm : min;
3611         size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length);
3612 
3613         testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3614                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]);
3615 
3616         testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3617                        "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]);
3618 
3619         testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3620                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]);
3621 
3622         testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3623                        fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]);
3624 
3625         /* Distinct Sampling.
3626          */
3627         size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length);
3628 
3629         testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
3630                        "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]);
3631 
3632         testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
3633                        fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]);
3634 
3635         testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
3636                        "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]);
3637 
3638         testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
3639                        fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
3640     }
3641 
3642     /* Similar tests with the 1x10 data set. */
3643     for (size_t n = data1x10.length + 2; n >= 1; n--)
3644     {
3645         size_t expectedLength = min(data1x10.length, n + 1);
3646         testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
3647                        "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);
3648 
3649         testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
3650                        "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);
3651 
3652         testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
3653                        fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);
3654 
3655         testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
3656                        "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
3657     }
3658 
3659     /* Simple random sampling with replacement: ensure sample size doesn't change order. */
3660     for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
3661     {
3662         testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
3663                       data3x6ExpectedReplaceNum10[0 .. n + 1]);
3664 
3665         testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
3666                       data3x6ExpectedReplaceNum10[1 .. n + 1]);
3667     }
3668 
3669     /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */
3670     for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
3671     {
3672         size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);
3673 
3674         testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
3675                        "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);
3676 
3677         testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
3678                        fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
3679     }
3680 
3681     /* Inorder sampling tests using reservoir sampling via heap (compatibility mode). */
3682     testTsvSample(["test-ar10", "--compatibility-mode", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
3683     testTsvSample(["test-ar11", "--compatibility-mode", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
3684     testTsvSample(["test-ar12", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
3685     testTsvSample(["test-ar13", "--compatibility-mode", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0);
3686     testTsvSample(["test-ar14", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1);
3687     testTsvSample(["test-ar15", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1);
3688     testTsvSample(["test-ar16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
3689     testTsvSample(["test-ar17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
3690     testTsvSample(["test-ar18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum5Inorder);
3691     testTsvSample(["test-ar19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", fpath_data3x6],         data3x6ExpectedSampleCompatNum4Inorder);
3692     testTsvSample(["test-ar20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum3Inorder);
3693     testTsvSample(["test-ar21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum2Inorder);
3694     testTsvSample(["test-ar22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum1Inorder);
3695 
3696     testTsvSample(["test-as10", "--compatibility-mode", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
3697     testTsvSample(["test-as11", "--compatibility-mode", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
3698     testTsvSample(["test-as14", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3699     testTsvSample(["test-as15", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3700     testTsvSample(["test-as16", "--compatibility-mode", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
3701     testTsvSample(["test-as17", "--compatibility-mode", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
3702     testTsvSample(["test-as18", "--compatibility-mode", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5Inorder[1 .. $]);
3703     testTsvSample(["test-as19", "--compatibility-mode", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4Inorder[1 .. $]);
3704     testTsvSample(["test-as20", "--compatibility-mode", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3Inorder[1 .. $]);
3705     testTsvSample(["test-as21", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2Inorder[1 .. $]);
3706     testTsvSample(["test-as22", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1Inorder[1 .. $]);
3707 
3708     /* Inorder sampling tests with random number printing. --compatibility-mode not needed ('b' cases omit it). */
3709     testTsvSample(["test-at16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
3710     testTsvSample(["test-at17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
3711     testTsvSample(["test-at18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum5ProbsInorder);
3712     testTsvSample(["test-at19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
3713     testTsvSample(["test-at19b",                        "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
3714     testTsvSample(["test-at20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
3715     testTsvSample(["test-at20b",                        "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
3716     testTsvSample(["test-at21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum2ProbsInorder);
3717     testTsvSample(["test-at22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum1ProbsInorder);
3718
3719     testTsvSample(["test-au16", "--compatibility-mode", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
3720     testTsvSample(["test-au17", "--compatibility-mode", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
3721     testTsvSample(["test-au18", "--compatibility-mode", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5ProbsInorder[1 .. $]);
3722     testTsvSample(["test-au19", "--compatibility-mode", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
3723     testTsvSample(["test-au19b",                        "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
3724     testTsvSample(["test-au20", "--compatibility-mode", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3ProbsInorder[1 .. $]);
3725     testTsvSample(["test-au21", "--compatibility-mode", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2ProbsInorder[1 .. $]);
3726     testTsvSample(["test-au22", "--compatibility-mode", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1ProbsInorder[1 .. $]);
3727 
3728     /* Inorder weighted sampling tests. Weights are read from field 3 (-w 3), matching the Wt3 expected data. */
3729     testTsvSample(["test-ax16", "-H", "-s", "-n", "7", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
3730     testTsvSample(["test-ax17", "-H", "-s", "-n", "6", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
3731     testTsvSample(["test-ax18", "-H", "-s", "-n", "5", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num5Inorder);
3732     testTsvSample(["test-ax19", "-H", "-s", "-n", "4", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num4Inorder);
3733     testTsvSample(["test-ax20", "-H", "-s", "-n", "3", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num3Inorder);
3734     testTsvSample(["test-ax21", "-H", "-s", "-n", "2", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num2Inorder);
3735     testTsvSample(["test-ax22", "-H", "-s", "-n", "1", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num1Inorder);
3736
3737     testTsvSample(["test-ay16", "-s", "-n", "7", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
3738     testTsvSample(["test-ay17", "-s", "-n", "6", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
3739     testTsvSample(["test-ay18", "-s", "-n", "5", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num5Inorder[1 .. $]);
3740     testTsvSample(["test-ay19", "-s", "-n", "4", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num4Inorder[1 .. $]);
3741     testTsvSample(["test-ay20", "-s", "-n", "3", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num3Inorder[1 .. $]);
3742     testTsvSample(["test-ay21", "-s", "-n", "2", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num2Inorder[1 .. $]);
3743     testTsvSample(["test-ay22", "-s", "-n", "1", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num1Inorder[1 .. $]);
3744 
3745     /*
3746      * Distinct sampling tests.
3747      */
3748     testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
3749                   data5x25ExpectedDistinctK2P40);
3750 
3751     testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
3752                   data5x25ExpectedDistinctK2K4P20);
3753 
3754     testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
3755                   data5x25ExpectedDistinctK2K3K4P20);
3756 
3757     testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
3758                   data5x25ExpectedDistinctK2P40[1 .. $]);
3759 
3760     testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
3761                   data5x25ExpectedDistinctK2K4P20[1 .. $]);
3762 
3763     testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
3764                   data5x25ExpectedDistinctK2K3K4P20[1 .. $]);
3765 
3766 
3767     /* These distinct tests check that using the whole line as the key ('-k 0') gives the
3768      * same result as listing all fields in order. They also check that field numbers don't
3769      * matter: '-k 1,2' in data2x25 selects the same keys as '-k 2,4' in data5x25.
3770      */
3771     testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25],
3772                   data2x25ExpectedDistinctK1K2P20);
3773 
3774     testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25],
3775                   data2x25ExpectedDistinctK1K2P20);
3776 
3777     testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader],
3778                   data2x25ExpectedDistinctK1K2P20[1 .. $]);
3779 
3780     testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader],
3781                   data2x25ExpectedDistinctK1K2P20[1 .. $]);
3782 
3783     /* Similar to the last set, but for a 1-column file. Also with random value printing. */
3784     testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25],
3785                   data1x25ExpectedDistinctK1P20);
3786 
3787     testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25],
3788                   data1x25ExpectedDistinctK1P20);
3789 
3790     testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader],
3791                   data1x25ExpectedDistinctK1P20[1 .. $]);
3792 
3793     testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader],
3794                   data1x25ExpectedDistinctK1P20[1 .. $]);
3795 
3796 
3797     testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25],
3798                   data1x25ExpectedDistinctK1P20Probs);
3799 
3800     testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25],
3801                   data1x25ExpectedDistinctK1P20Probs);
3802 
3803     testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader],
3804                   data1x25ExpectedDistinctK1P20Probs[1 .. $]);
3805 
3806     testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader],
3807                   data1x25ExpectedDistinctK1P20Probs[1 .. $]);
3808 
3809 
3810     testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25],
3811                   data1x25ExpectedDistinctK1P20ProbsInorder);
3812 
3813     testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25],
3814                   data1x25ExpectedDistinctK1P20ProbsInorder);
3815 
3816     testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader],
3817                   data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);
3818 
3819     testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader],
3820                   data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);
3821 
3822 }