tsv_utils.tsv_sample source code

1 /**
2 Command line tool for randomizing or sampling lines from input streams. Several
3 sampling methods are available, including simple random sampling, weighted random
4 sampling, Bernoulli sampling, and distinct sampling.
5 
6 Copyright (c) 2017-2018, eBay Software Foundation
7 Initially written by Jon Degenhardt
8 
9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 module tsv_utils.tsv_sample;
12 
13 import std.range;
14 import std.stdio;
15 import std.typecons : tuple, Flag;
16 
version(unittest)
{
    // Unit test builds obtain their entry point from the -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line, then runs the selected sampling routine with
     * output buffered on stdout.
     *
     * Returns: The exit code supplied by argument processing when it indicates the
     * program should not continue, 1 if sampling throws an Exception, 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        version(D_Coverage) version(DigitalMars)
        {
            /* DMD code coverage builds: merge reports rather than overwriting them. */
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        auto argsStatus = cmdopt.processArgs(cmdArgs);
        immutable bool shouldContinue = argsStatus[0];
        if (!shouldContinue) return argsStatus[1];

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        int exitCode = 0;
        try
        {
            import tsv_utils.common.utils : BufferedOutputRange;

            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            exitCode = 1;
        }

        return exitCode;
    }
}
55 
/* Brief help text, printed ahead of the option list for --help. */
auto helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

The '--n|num' option limits the sample size produced. It speeds up line
order randomization and weighted sampling significantly. It is also used
to terminate sampling with replacement.

Use '--help-verbose' for detailed information.

Options:
EOS";
86 
/* Detailed help text, printed ahead of the option list for --help-verbose. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up line order randomization and weighted sampling significantly
(details below). It is also used to terminate sampling with replacement.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, so there is no memory accumulation. These
algorithms support arbitrary size inputs. Sampling with replacement reads
all lines into memory and is limited by available memory. The line order
randomization algorithms hold the full output set in memory prior to
generating results. This ultimately limits the size of the output set. For
these, memory needs can be reduced by using a sample size (--n|num). This
engages reservoir sampling. Output order is not affected. Both
'tsv-sample -n 1000' and 'tsv-sample | head -n 1000' produce the same
results, but the former is quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";
180 
/** Container for command line options.
 *
 * Fields documented with an option name are populated by getopt in processArgs.
 * Fields marked "Derived" are computed by processArgs from the other options.
 */
struct TsvSampleOptions
{
    string programName;                        /// Program name
    string[] files;                            /// Input files
    bool helpVerbose = false;                  /// --help-verbose
    bool hasHeader = false;                    /// --H|header
    size_t sampleSize = 0;                     /// --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
    size_t[] keyFields;                        /// --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    /// --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           /// --r|replace
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    bool printRandom = false;                  /// --print-random
    bool genRandomInorder = false;             /// --gen-random-inorder
    string randomValueHeader = "random_value"; /// --random-value-header
    bool compatibilityMode = false;            /// --compatibility-mode
    char delim = '\t';                         /// --d|delimiter
    bool versionWanted = false;                /// --V|version
    bool preferSkipSampling = false;           /// --prefer-skip-sampling
    bool preferAlgorithmR = false;             /// --prefer-algorithm-r
    bool hasWeightField = false;               /// Derived.
    bool useBernoulliSampling = false;         /// Derived.
    bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
    bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
    uint seed = 0;                             /// Derived from --static-seed, --seed-value

    /** Parses the command line, validates option combinations, and fills in the
     * derived fields.
     *
     * On success cmdArgs is truncated to its first element; the remaining
     * arguments are appended to `files` ("-" is used if there are none).
     *
     * Returns: A tuple (bool, int). The bool is true if the program should continue
     * running. When it is false the int is the exit code to use: 0 after printing
     * help or version information, 1 after reporting an argument error on stderr.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : any, canFind, each;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected for output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                "k|key-fields",    "<field-list>  Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "w|weight-field",  "NUM  Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
                "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header",  "     Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
                "compatibility-mode", "     Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,

                "d|delimiter",     "CHR  Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "prefer-skip-sampling", "     (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
                &preferSkipSampling,

                "prefer-algorithm-r",   "     (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
                &preferAlgorithmR,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            if (srsWithReplacement)
            {
                if (hasWeightField)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");
                }
                else if (!inclusionProbability.isNaN)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");
                }
                else if (keyFields.length > 0)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");
                }
                else if (printRandom || genRandomInorder)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");
                }
            }

            if (keyFields.length > 0)
            {
                /* Note: useDistinctSampling is set as part of the inclusion probability checks below. */

                if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields.");

                if (keyFields.length == 1 && keyFields[0] == 0)
                {
                    distinctKeyIsFullLine = true;
                }
                else
                {
                    if (keyFields.length > 1 && keyFields.any!(x => x == 0))
                    {
                        throw new Exception("Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");
                    }

                    keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
                }
            }

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */
            if (!inclusionProbability.isNaN)
            {
                if (inclusionProbability <= 0.0 || inclusionProbability > 1.0)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));
                }

                if (keyFields.length > 0) useDistinctSampling = true;
                else useBernoulliSampling = true;

                if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together.");

                if (genRandomInorder && !useDistinctSampling)
                {
                    /* The option has no short form; name it as registered with getopt. */
                    throw new Exception("--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used.");
                }
            }
            else if (genRandomInorder && !hasWeightField)
            {
                /* Unweighted gen-random-inorder is handled by the Bernoulli sampling routine. */
                useBernoulliSampling = true;
            }

            if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') ||
                randomValueHeader.canFind(delim))
            {
                throw new Exception("--random-value-header must be at least one character and not contain field delimiters or newlines.");
            }

            /* Random value printing implies compatibility-mode, otherwise user's selection is used. */
            if (printRandom || genRandomInorder) compatibilityMode = true;

            /* Seed. */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            /* Assume remaining args are files. Use standard input if files were not provided. */
            files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"];
            cmdArgs.length = 1;
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
/** Dispatches to the sampling routine selected by the command line options.
 *
 * The options are checked in a fixed priority order: sampling with replacement,
 * Bernoulli sampling, distinct sampling, weighted gen-random-inorder, reservoir
 * sampling (when a sample size is given), and finally full line order randomization.
 */
void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.srsWithReplacement)
    {
        simpleRandomSamplingWithReplacement(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useBernoulliSampling)
    {
        bernoulliSamplingCommand(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useDistinctSampling)
    {
        if (cmdopt.genRandomInorder)
        {
            distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
        return;
    }

    if (cmdopt.genRandomInorder)
    {
        /* The preceding cases either handle gen-random-inorder themselves (Bernoulli,
         * Distinct) or don't support it (SRS w/ Replacement). What remains here is
         * the weighted case.
         */
        assert(cmdopt.hasWeightField);
        generateWeightedRandomValuesInorder(cmdopt, outputStream);
        return;
    }

    if (cmdopt.sampleSize != 0)
    {
        reservoirSamplingCommand(cmdopt, outputStream);
        return;
    }

    randomizeLinesCommand(cmdopt, outputStream);
}
421 
/** Invokes the appropriate Bernoulli sampling routine based on the command line arguments.
 *
 * This routine selects the Bernoulli sampling function and template instantiation
 * to use based on the command line arguments.
 *
 * The standard per-line routine (bernoulliSampling) is used when compatibility mode
 * is on, when the inclusion probability is 1.0 or more, or when the inclusion
 * probability is above skipSamplingProbabilityThreshold and skip sampling has not
 * been explicitly preferred. Otherwise bernoulliSkipSampling is used.
 *
 * See the bernoulliSkipSampling routine for a discussion of the choices behind the
 * skipSamplingProbabilityThreshold used here.
 */
void bernoulliSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(!cmdopt.hasWeightField);

    immutable double skipSamplingProbabilityThreshold = 0.04;

    /* Skip sampling requires an inclusion probability strictly less than 1.0 (it
     * takes the log of the discard rate). A probability of 1.0 is a valid --p|prob
     * value, so it must go to the standard routine even with --prefer-skip-sampling.
     */
    immutable bool skipSamplingUnsupported =
        cmdopt.compatibilityMode || cmdopt.inclusionProbability >= 1.0;

    if (skipSamplingUnsupported ||
        (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling))
    {
        if (cmdopt.genRandomInorder)
        {
            bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
    }
    else
    {
        bernoulliSkipSampling(cmdopt, outputStream);
    }
}
454 
/** Bernoulli sampling of lines on the input stream.
 *
 * A uniform random value is drawn for every data line. When generateRandomAll is
 * set, every line is written with its random value prepended. Otherwise a line is
 * written only if its value is less than cmdopt.inclusionProbability, with the value
 * prepended when cmdopt.printRandom is set. Line order is not changed.
 *
 * With cmdopt.hasHeader, the first line of the first file is written as the header
 * and the first line of each later file is dropped. Output stops once
 * cmdopt.sampleSize lines have been written, if the sample size is non-zero.
 */
void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    auto rng = Random(cmdopt.seed);
    immutable scoreFormatSpec = singleSpec("%.17g");

    /* Whether random values (and their header) are prepended to the output. */
    static if (generateRandomAll) enum bool prependScore = true;
    else immutable bool prependScore = cmdopt.printRandom;

    immutable bool limitOutput = (cmdopt.sampleSize != 0);
    bool headerDone = false;
    size_t linesOut = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the first file's header is written. */
                    if (!headerDone)
                    {
                        if (prependScore)
                        {
                            outputStream.put(cmdopt.randomValueHeader);
                            outputStream.put(cmdopt.delim);
                        }
                        outputStream.put(line);
                        outputStream.put("\n");
                        headerDone = true;
                    }
                    continue;
                }
            }

            /* One random draw per data line, whether or not the line is selected. */
            immutable double score = uniform01(rng);

            static if (!generateRandomAll)
            {
                if (!(score < cmdopt.inclusionProbability)) continue;
            }

            if (prependScore)
            {
                outputStream.formatValue(score, scoreFormatSpec);
                outputStream.put(cmdopt.delim);
            }
            outputStream.put(line);
            outputStream.put("\n");

            if (limitOutput)
            {
                ++linesOut;
                if (linesOut == cmdopt.sampleSize) return;
            }
        }
    }
}
542 
/** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips.
 *
 * Skip sampling works by skipping a random number of lines between selections. This
 * can be faster than assigning a random value to each line when the inclusion
 * probability is low, as it reduces the number of calls to the random number
 * generator. Both the random number generator and the log() function are called when
 * calculating the next skip size. These additional log() calls add up as the
 * probability increases.
 *
 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
 * file-oriented line sampling. This is obviously environment specific. In the
 * environments this implementation has been tested in the performance improvements
 * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
 *
 * The algorithm does not assign random values to individual lines. This makes it
 * incompatible with random value printing. It is not suitable for compatibility mode
 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
 * does not have this property.
 *
 * Preconditions (asserted): 0.0 < cmdopt.inclusionProbability < 1.0, and neither
 * cmdopt.printRandom nor cmdopt.compatibilityMode is set.
 *
 * The output stream is taken by `auto ref`, matching the other sampling routines, so
 * writes go to the caller's output range rather than to a copy of it.
 *
 * The algorithm for calculating the skip size has been described by multiple sources.
 * There are two key variants depending on whether the total number of lines in the
 * data set is known in advance. (This implementation does not know the total.)
 * Useful references:
 * $(LIST
 *     * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
 *       ACM Trans on Mathematical Software, 1987. On-line:
 *       http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
 *     * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
 *       "Data Stream Management", Springer-Verlag, 2016. On-line:
 *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
 *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
 *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
 * )
 */
void bernoulliSkipSampling(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
    if (isOutputRange!(OutputRange, char))
{
    import std.conv : to;
    import std.math : log, trunc;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.compatibilityMode);

    auto randomGenerator = Random(cmdopt.seed);

    immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
    immutable double logDiscardRate = log(discardRate);

    /* Draws the number of lines to skip before the next selected line.
     *
     * Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed
     * interval to (0.0, 1.0], excluding 0.0.
     */
    size_t nextSkipCount()
    {
        return (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
    }

    size_t remainingSkips = nextSkipCount();

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the first file's header is written; later headers are dropped. */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else if (remainingSkips > 0)
            {
                --remainingSkips;
            }
            else
            {
                outputStream.put(line);
                outputStream.put("\n");

                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }

                remainingSkips = nextSkipCount();
            }
        }
    }
}
638 
/** Distinct sampling: writes the records belonging to a sampled subset of the unique
 * key values.
 *
 * The key of each record (the whole line, or the selected key fields joined by the
 * delimiter) is hashed with MurmurHash3 seeded from cmdopt.seed. The hash value is
 * reduced modulo the bucket count, which is the rounded reciprocal of the inclusion
 * probability. Records that land in bucket zero are written, so records sharing a key
 * are either all kept or all dropped.
 *
 * When generateRandomAll is set no filtering takes place. Every record is written,
 * prefixed with its bucket number.
 *
 * If cmdopt.sampleSize is non-zero, processing stops as soon as that many records have
 * been written.
 *
 * Throws: Exception when a line has too few fields to fill all key fields.
 */
void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsv_utils.common.utils : InputFieldReordering, throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);

    static if (generateRandomAll)
    {
        import std.format : formatValue, singleSpec;
        immutable bucketFormatSpec = singleSpec("%d");
    }

    /* Separator fed to the hasher between the fields of a multi-field key. */
    immutable ubyte[1] keySeparator = [cmdopt.delim];

    uint bucketCount = (1.0 / cmdopt.inclusionProbability).lrint.to!uint;

    /* Field extractor for the key fields. Not needed when the full line is the key. */
    auto keyReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);

    bool headerDone = false;
    size_t writtenCount = 0;

    foreach (inputName; cmdopt.files)
    {
        auto source = (inputName == "-") ? stdin : File(inputName);

        foreach (lineNum, record; source.byLine(KeepTerminator.no).enumerate(1))
        {
            if (lineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(record, inputName, lineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the header of the first file is written. */
                    if (!headerDone)
                    {
                        if (generateRandomAll || cmdopt.printRandom)
                        {
                            outputStream.put(cmdopt.randomValueHeader);
                            outputStream.put(cmdopt.delim);
                        }

                        outputStream.put(record);
                        outputStream.put("\n");
                        headerDone = true;
                    }
                    continue;
                }
            }

            /* Murmurhash is fed the key pieces one at a time and then finalized. The
             * full-line key and the field-based key are handled separately to keep
             * the feeding logic simple.
             */
            auto keyHasher = MurmurHash3!32(cmdopt.seed);

            if (cmdopt.distinctKeyIsFullLine)
            {
                keyHasher.put(cast(ubyte[]) record);
            }
            else
            {
                assert(keyReordering !is null);

                /* Collect the key field values from the line. */
                keyReordering.initNewLine;
                foreach (fieldPos, fieldText; record.splitter(cmdopt.delim).enumerate)
                {
                    keyReordering.processNextField(fieldPos, fieldText);
                    if (keyReordering.allFieldsFilled) break;
                }

                if (!keyReordering.allFieldsFilled)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               (inputName == "-") ? "Standard Input" : inputName, lineNum));
                }

                /* Hash the key fields, separated by the delimiter. */
                foreach (keyPos, keyText; keyReordering.outputFields.enumerate)
                {
                    if (keyPos > 0) keyHasher.put(keySeparator);
                    keyHasher.put(cast(ubyte[]) keyText);
                }
            }

            keyHasher.finish;
            auto bucket = keyHasher.get % bucketCount;

            static if (generateRandomAll)
            {
                outputStream.formatValue(bucket, bucketFormatSpec);
                outputStream.put(cmdopt.delim);
            }
            else
            {
                /* Only bucket zero is part of the sample. */
                if (bucket != 0) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put('0');
                    outputStream.put(cmdopt.delim);
                }
            }

            outputStream.put(record);
            outputStream.put("\n");

            if (cmdopt.sampleSize != 0 && ++writtenCount == cmdopt.sampleSize) return;
        }
    }
}
778 
/** Dispatches to the reservoir sampling routine matching the command line arguments.
 *
 * Reservoir sampling is used when a fixed size sample is pulled from an input stream.
 * Both weighted and unweighted sampling are supported. The selected lines are also
 * output in random order, consistent with line order randomization of the entire
 * input stream (handled by randomizeLinesCommand).
 *
 * Selection rules:
 * - A weight field always uses the heap-based routine in weighted mode.
 * - Unweighted sampling uses the heap-based routine in compatibility mode, or when the
 *   sample size is below the threshold and Algorithm R has not been requested.
 * - Everything else uses Algorithm R.
 *
 * For the unweighted case the two implementations have different performance
 * tradeoffs, see the reservoirSamplingAlgorithmR documentation. The threshold was
 * chosen based on performance tests.
 */
void reservoirSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(cmdopt.sampleSize != 0);

    immutable size_t heapSampleSizeLimit = 128 * 1024;

    if (cmdopt.hasWeightField)
    {
        reservoirSamplingViaHeap!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    immutable bool smallSampleViaHeap =
        cmdopt.sampleSize < heapSampleSizeLimit && !cmdopt.preferAlgorithmR;

    if (cmdopt.compatibilityMode || smallSampleViaHeap)
    {
        reservoirSamplingViaHeap!(No.isWeighted)(cmdopt, outputStream);
    }
    else
    {
        reservoirSamplingAlgorithmR(cmdopt, outputStream);
    }
}
816 
/** Reservoir sampling using a heap. Supports both weighted and unweighted random
 * sampling.
 *
 * The algorithm is based on the one-pass algorithm described by Pavlos Efraimidis and
 * Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis,
 * https://arxiv.org/abs/1012.0256). In the weighted case a line's score is a uniform
 * random value raised to the power (1 / weight), or zero if the weight is not
 * positive. In the unweighted case the score is the uniform random value itself.
 *
 * A heap (priority queue) sized to the requested number of lines is used. Input is
 * read line-by-line, each line is given a score, and the heap keeps the lines with
 * the highest scores. Once the heap is full a new line is only admitted by replacing
 * the line with the lowest score, which is why a "min" heap is used.
 *
 * After all input has been read the "min" heap is in the opposite order of what is
 * needed for output. Removing the elements one at a time from the heap leaves the
 * underlying data store in the desired order.
 *
 * Writing the output in score order matters for several reasons:
 *  - For weighted sampling, smaller valid subsets can be created by taking the first
 *    N lines.
 *  - For unweighted sampling, all output permutations are possible and are not
 *    influenced by input order or the heap data structure.
 *  - Output order stays consistent when the same random seed is used with different
 *    sample sizes.
 *
 * Some use cases only care about the selected set. Those could skip the reordering
 * and print the backing store as-is, but making that distinction seems an
 * unnecessary complication.
 *
 * Notes:
 * $(LIST
 *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
 *      randomization of all input lines. This was dropped in version 1.2.2 in favor
 *      of the approach used in randomizeLines, which has significant advantages
 *      given that all data must be read into memory.
 *    * For larger reservoir sizes better performance can be achieved by using
 *      reservoirSamplingAlgorithmR. See the documentation of that function for details.
 * )
 */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.container.array;
    import std.container.binaryheap;
    import std.format : formatValue, singleSpec;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    auto rng = Random(cmdopt.seed);

    struct ScoredLine
    {
        double score;
        char[] line;
    }

    /* Heap and its backing data store.
     *
     * An std.container.array is used as the backing store to work around issues in
     * the Phobos binaryheap implementation. With this backing store the heap can be
     * reversed efficiently by removing all heap elements, which leaves the backing
     * store in reversed order. The current binaryheap implementation does not
     * support this for all backing stores.
     * See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */
    Array!ScoredLine backingStore;
    backingStore.reserve(cmdopt.sampleSize);
    auto minHeap = backingStore.heapify!("a.score > b.score")(0);  // Min binaryheap

    bool headerDone = false;

    foreach (inputName; cmdopt.files)
    {
        auto source = (inputName == "-") ? stdin : File(inputName);

        foreach (lineNum, text; source.byLine(KeepTerminator.no).enumerate(1))
        {
            if (lineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(text, inputName, lineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the header of the first file is written. */
                    if (!headerDone)
                    {
                        if (cmdopt.printRandom)
                        {
                            outputStream.put(cmdopt.randomValueHeader);
                            outputStream.put(cmdopt.delim);
                        }
                        outputStream.put(text);
                        outputStream.put("\n");
                        headerDone = true;
                    }
                    continue;
                }
            }

            static if (isWeighted)
            {
                double weight =
                    getFieldValue!double(text, cmdopt.weightField, cmdopt.delim, inputName, lineNum);

                /* Lines without a positive weight get the lowest possible score. */
                double score = 0.0;
                if (weight > 0.0) score = uniform01(rng) ^^ (1.0 / weight);
            }
            else
            {
                double score = uniform01(rng);
            }

            if (minHeap.length < cmdopt.sampleSize)
            {
                /* Still filling the reservoir. */
                minHeap.insert(ScoredLine(score, text.dup));
            }
            else if (minHeap.front.score < score)
            {
                /* Beats the current lowest score, replace it. */
                minHeap.replaceFront(ScoredLine(score, text.dup));
            }
        }
    }

    /* The reservoir holds the sample. The heap is in reverse order of the assigned
     * scores. Removing all elements from the heap leaves the backing store in the
     * order needed for output.
     *
     * The asserts guard against issues in the current binaryheap implementation by
     * detecting a backing store whose length is not synchronized with the heap.
     */
    immutable size_t selectedCount = minHeap.length;
    assert(selectedCount == backingStore.length);

    while (!minHeap.empty) minHeap.removeFront;
    assert(selectedCount == backingStore.length);

    immutable scoreFormatSpec = singleSpec("%.17g");

    foreach (selected; backingStore)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatValue(selected.score, scoreFormatSpec);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(selected.line);
        outputStream.put("\n");
    }
}
970 
/** Writes every input line prefixed with a weighted random value, keeping input order.
 *
 * This complements weighted reservoir sampling. No reservoir is used, the input lines
 * are simply iterated and a value generated for each. The value is a uniform random
 * number raised to the power (1 / weight), or zero when the weight is not positive.
 * This is the same formula used by reservoirSamplingViaHeap.
 *
 * If cmdopt.sampleSize is non-zero, processing stops after that many lines have been
 * written.
 */
void generateWeightedRandomValuesInorder(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.hasWeightField);

    auto rng = Random(cmdopt.seed);
    immutable scoreFormatSpec = singleSpec("%.17g");

    bool headerDone = false;
    size_t writtenCount = 0;

    foreach (inputName; cmdopt.files)
    {
        auto source = (inputName == "-") ? stdin : File(inputName);

        foreach (lineNum, text; source.byLine(KeepTerminator.no).enumerate(1))
        {
            if (lineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(text, inputName, lineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the header of the first file is written. */
                    if (!headerDone)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                        outputStream.put(text);
                        outputStream.put("\n");
                        headerDone = true;
                    }
                    continue;
                }
            }

            double weight = getFieldValue!double(text, cmdopt.weightField, cmdopt.delim,
                                                 inputName, lineNum);

            /* Lines without a positive weight get a value of zero. */
            double score = 0.0;
            if (weight > 0.0) score = uniform01(rng) ^^ (1.0 / weight);

            outputStream.formatValue(score, scoreFormatSpec);
            outputStream.put(cmdopt.delim);
            outputStream.put(text);
            outputStream.put("\n");

            if (cmdopt.sampleSize != 0 && ++writtenCount == cmdopt.sampleSize) return;
        }
    }
}
1032 
/** Reservoir sampling via Algorithm R
 *
 * This is an implementation of reservoir sampling using what is commonly known as
 * "Algorithm R", credited to Alan Waterman by Donald Knuth in "The Art of Computer
 * Programming, Volume 2: Seminumerical Algorithms". More information about the
 * algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with a
 * Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
 *
 * Algorithm R is used for unweighted sampling without replacement. The heap-based
 * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
 *
 * The classic algorithm stops after identifying the selected set of items. This
 * implementation goes one step further and randomizes the order of the selected
 * lines. This supports the tsv-sample use-case, which is line order randomization.
 *
 * This algorithm is faster than reservoirSamplingViaHeap when the sample size
 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
 * Insertion in this algorithm is O(1). Similarly, generating the random order in the
 * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
 *
 * This speed advantage may be offset a certain amount by using a more expensive random
 * value generator. reservoirSamplingViaHeap generates values between zero and one,
 * whereas reservoirSamplingAlgorithmR generates random integers over an ever growing
 * interval. The latter is expected to be more expensive. This is consistent with
 * performance tests indicating that reservoirSamplingViaHeap is faster when using
 * small-to-medium size reservoirs and large input streams.
 *
 * Params:
 *   cmdopt       = Command line options. sampleSize must be non-zero. Weighted
 *                  sampling, compatibility mode and random value printing are not
 *                  supported by this routine.
 *   outputStream = Output range receiving the header (if any) and the sampled lines.
 */
void reservoirSamplingAlgorithmR(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, randomShuffle, uniform;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.sampleSize > 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.compatibilityMode);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    string[] reservoir;
    auto reservoirAppender = appender(&reservoir);
    reservoirAppender.reserve(cmdopt.sampleSize);

    auto randomGenerator = Random(cmdopt.seed);

    /* Process each line. */

    bool headerWritten = false;
    size_t totalLineNum = 0;    // Number of data lines seen before the current line.
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the header of the first file is written. */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                /* Add lines to the reservoir until the reservoir is filled.
                 * After that lines are added with decreasing likelihood, based on
                 * the total number of lines seen. If added to the reservoir, the
                 * line replaces a randomly chosen existing line.
                 */
                if (totalLineNum < cmdopt.sampleSize)
                {
                    reservoirAppender ~= line.idup;
                }
                else
                {
                    /* The current line is number (totalLineNum + 1). Algorithm R
                     * draws a slot uniformly from all lines seen so far, including
                     * the current one, so the line is kept with probability
                     * sampleSize / (totalLineNum + 1). uniform() excludes its upper
                     * bound, hence the "+ 1".
                     */
                    size_t i = uniform(0, totalLineNum + 1, randomGenerator);
                    if (i < reservoir.length) reservoir[i] = line.idup;
                }

                ++totalLineNum;
            }
        }
    }

    /* The random sample is now in the reservoir. Shuffle it and print. */

    reservoir.randomShuffle(randomGenerator);

    foreach (ref line; reservoir)
    {
        outputStream.put(line);
        outputStream.put("\n");
    }
}
1130 
/** Dispatches to the line randomization routine matching the command line arguments.
 *
 * A weight field selects the sort-based routine in weighted mode. Without a weight
 * field, compatibility mode selects the sort-based routine in unweighted mode.
 * Otherwise the shuffle-based routine is used.
 */
void randomizeLinesCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.hasWeightField)
    {
        randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    if (cmdopt.compatibilityMode)
    {
        randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
        return;
    }

    randomizeLinesViaShuffle(cmdopt, outputStream);
}
1153 
/** Randomizes the order of all input lines by assigning random values and sorting.
 *
 * All lines from the files and/or standard input are read into memory. Each line is
 * assigned a random value, the lines are sorted by that value in descending order, and
 * then written out. Both weighted and unweighted (simple random) randomization are
 * supported. The random value is written in front of each line when random value
 * printing is turned on.
 *
 * This is significantly faster than heap-based reservoir sampling when the entire
 * input is being read. See also randomizeLinesViaShuffle for the unweighted case. It
 * is a little faster, at the cost of not supporting random value printing or
 * compatibility-mode.
 *
 * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 */
void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map, sort;
    import std.format : formatValue, singleSpec;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize == 0);

    /* Load all files into memory, split them into lines, and give each line a random
     * value. identifyFileLines also writes the first header line.
     */
    auto loadedFiles = cmdopt.files.map!FileData.array;
    auto scoredLines = loadedFiles.identifyFileLines!(Yes.hasRandomValue, isWeighted)(cmdopt, outputStream);

    /* Highest random value first. */
    scoredLines.sort!((lhs, rhs) => lhs.randomValue > rhs.randomValue);

    immutable scoreFormatSpec = singleSpec("%.17g");

    foreach (scored; scoredLines)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatValue(scored.randomValue, scoreFormatSpec);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(scored.data);
        outputStream.put("\n");
    }
}
1206 
/** Randomizes the order of all input lines using array shuffling.
 *
 * All lines from the files and/or standard input are read into memory, shuffled, and
 * written out. Shuffling is faster than sorting, making this routine a good
 * alternative to randomizeLinesViaSort for unweighted randomization.
 *
 * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 *
 * Random value printing and compatibility-mode are not supported by this routine.
 */
void randomizeLinesViaShuffle(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, randomShuffle;

    assert(cmdopt.sampleSize == 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    /* Load all files into memory and split them into lines. The header line, if
     * any, is written by identifyFileLines.
     */
    auto loadedFiles = cmdopt.files.map!FileData.array;
    auto allLines = loadedFiles.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);

    /* Shuffle in place, then write the lines in their new order.
     *
     * Note: randomCover was also tried, but it was exceedingly slow.
     */
    auto rng = Random(cmdopt.seed);
    allLines.randomShuffle(rng);

    foreach (ref shuffled; allLines)
    {
        outputStream.put(shuffled.data);
        outputStream.put("\n");
    }
}
1253 
/** Simple random sampling with replacement.
 *
 * All lines from the files and/or standard input are read into memory. Lines are then
 * picked at random, one at a time, and written out. A line can be picked any number
 * of times. This continues until the requested number of samples (--n|num) has been
 * written. If no sample size was given (sampleSize is zero) output continues
 * indefinitely. Nothing beyond the header is written when there are no input lines.
 */
void simpleRandomSamplingWithReplacement(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, uniform;

    /* Load all files into memory and split them into lines. The header line, if
     * any, is written by identifyFileLines.
     */
    auto loadedFiles = cmdopt.files.map!FileData.array;
    auto allLines = loadedFiles.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);

    if (allLines.length == 0) return;

    auto rng = Random(cmdopt.seed);

    /* A sample size of zero means sampling never stops. */
    immutable bool endless = (cmdopt.sampleSize == 0);
    size_t remaining = cmdopt.sampleSize;

    while (endless || remaining > 0)
    {
        immutable size_t pick = uniform(0, allLines.length, rng);
        outputStream.put(allLines[pick].data);
        outputStream.put("\n");
        if (!endless) --remaining;
    }
}
1289 
/** A container and reader of data from a file or standard input.
 *
 * The FileData struct is used to read data from a file or standard input. It is used
 * by passing a filename to the constructor. The constructor reads the file data.
 * If the filename is a single hyphen ('-') then data is read from standard input.
 *
 * The struct makes the data available through two members: 'filename', which is the
 * filename, and 'data', which is a character array of the data.
 */
struct FileData
{
    string filename;    /// Name passed to the constructor ("-" for standard input).
    char[] data;        /// Complete contents read from the file or standard input.

    /** Reads the entire contents of the named file, or of standard input if the
     * name is "-", into 'data'.
     *
     * Params:
     *   fname = Path of the file to read, or "-" for standard input.
     */
    this(string fname)
    {
        import std.algorithm : min;
        import std.array : appender;

        filename = fname;

        ubyte[1024 * 128] fileRawBuf;
        auto dataAppender = appender(&data);
        auto ifile = (filename == "-") ? stdin : filename.File;

        if (filename != "-")
        {
            /* Pre-size the buffer when the file size is known. File.size reports
             * ulong.max when the size cannot be determined. The size is clamped to
             * size_t.max and cast explicitly, as ulong does not implicitly narrow to
             * size_t on 32-bit targets.
             */
            ulong filesize = ifile.size;
            if (filesize < ulong.max) dataAppender.reserve(cast(size_t) min(filesize, size_t.max));
        }

        /* Read in fixed size chunks, appending each chunk to 'data'. */
        foreach (ref ubyte[] buffer; ifile.byChunk(fileRawBuf)) dataAppender.put(cast(char[]) buffer);
    }
}
1324 
/** HasRandomValue is a compile-time boolean flag. identifyFileLines and InputLine use
 * it to distinguish use cases that need a random value per line from those that don't.
 */
alias HasRandomValue = Flag!("hasRandomValue");
1329 
/** An InputLine array is returned by identifyFileLines, with one entry for each
 * non-header line found in a FileData array. The 'data' member holds the line. A
 * 'randomValue' member is present only if random values are being generated.
 */
struct InputLine(HasRandomValue hasRandomValue)
{
    char[] data;

    static if (hasRandomValue)
    {
        double randomValue;
    }
}
1339 
/** identifyFileLines does the initial processing of file data for algorithms that read
 * all files into memory before processing.
 *
 * Three tasks are performed:
 * - All input data is split into lines.
 * - The header line of the first file is written to the output stream, preceded by
 *   the random value header if random value printing is on. Header lines of later
 *   files are dropped.
 * - If random values are being generated, each line is assigned one. For weighted
 *   values this is a uniform random number raised to the power (1 / weight), or zero
 *   when the weight is not positive.
 *
 * The key input is a FileData array with one element per file. FileData reads the
 * file when it is constructed.
 *
 * Returns: An array of InputLine structs. The struct has a 'randomValue' member if
 * random values are being assigned.
 */
InputLine!hasRandomValue[] identifyFileLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted, OutputRange)
(ref FileData[] fileData, TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    static assert(hasRandomValue || !isWeighted);
    static if(!hasRandomValue) assert(!cmdopt.printRandom);

    alias Line = InputLine!hasRandomValue;

    Line[] collected;
    auto collector = appender(&collected);

    static if (hasRandomValue) auto rng = Random(cmdopt.seed);

    bool headerDone = false;

    foreach (fd; fileData)
    {
        /* A trailing newline would otherwise produce an extra empty line. */
        auto content = fd.data;
        if (content.length > 0 && content[$ - 1] == '\n') content = content[0 .. $ - 1];

        foreach (lineNum, ref text; content.splitter('\n').enumerate(1))
        {
            if (lineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(text, fd.filename, lineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the header of the first file is written. */
                    if (!headerDone)
                    {
                        if (cmdopt.printRandom)
                        {
                            outputStream.put(cmdopt.randomValueHeader);
                            outputStream.put(cmdopt.delim);
                        }
                        outputStream.put(text);
                        outputStream.put("\n");
                        headerDone = true;
                    }
                    continue;
                }
            }

            static if (!hasRandomValue)
            {
                collector.put(Line(text));
            }
            else
            {
                static if (isWeighted)
                {
                    double weight =
                        getFieldValue!double(text, cmdopt.weightField, cmdopt.delim,
                                             fd.filename, lineNum);

                    /* Lines without a positive weight get a value of zero. */
                    double value = 0.0;
                    if (weight > 0.0) value = uniform01(rng) ^^ (1.0 / weight);
                }
                else
                {
                    double value = uniform01(rng);
                }

                collector.put(Line(text, value));
            }
        }
    }

    return collected;
}
1424 
1425 
import std.traits : isSomeChar;

/** Extracts field `fieldIndex` from `line` and converts it to type `T`.
 *
 * This is a thin wrapper over [tsv_utils.common.utils.getTsvFieldValue]; see that
 * function for the extraction and conversion details. Any exception it throws is
 * replaced by a new Exception whose message is tailored for this program: it
 * carries the original message, the file name ("Standard Input" when `filename`
 * is "-"), and the line number. When the conversion fails on line 1 the message
 * also suggests that the line may be a header.
 */
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException;
    import std.format : format;
    import tsv_utils.common.utils : getTsvFieldValue;

    try
    {
        return getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* The field text could not be converted to T. */
        immutable string source = (filename == "-") ? "Standard Input" : filename;
        immutable string headerHint =
            (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : "";

        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s%s",
                   exc.msg, source, lineNum, headerHint));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        immutable string source = (filename == "-") ? "Standard Input" : filename;

        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s",
                   exc.msg, source, lineNum));
    }
}
1460 
unittest
{
    /* Basic sanity checks of the getFieldValue wrapper. The wrapped function,
     * getTsvFieldValue, has its own tests.
     */
    import std.exception : assertThrown;

    enum source = "unittest";

    /* Fields that convert cleanly are returned as numbers. */
    assert(getFieldValue!double("123", 0, '\t', source, 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', source, 1) == 123.4);

    /* Failures must throw both on line 1 and on later lines. */
    foreach (size_t lineNum; 1 .. 3)
    {
        /* Field text that is not a number. */
        assertThrown(getFieldValue!double("abc", 0, '\t', source, lineNum));

        /* Field index past the last field on the line. */
        assertThrown(getFieldValue!double("123", 1, '\t', source, lineNum));
    }
}
1475 
1476 /* Unit tests for the main program start here.
1477  *
1478  * Portability note: Many of the tests here rely on generating consistent random numbers
1479  * across different platforms when using the same random seed. So far this has succeeded
1480  * on several different platform, compiler, and library versions. However, it is certainly
1481  * possible this condition will not hold on other platforms.
1482  *
1483  * For tsv-sample, this portability implies generating the same results on different
1484  * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
1485  * but it is convenient for testing. If platforms are identified that do not generate
1486  * the same results these tests will need to be adjusted.
1487  */
version(unittest)
{
    /* Unit test helper functions. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /** Runs tsvSample with the given command line and asserts that the output
     * matches `expected`.
     *
     * `cmdArgs` is a full argument list; its first element is used as the test
     * label in assertion messages. `expected` is converted to text with
     * tsvDataToString before being compared with the captured output.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Builds an assertion message prefixed with the helper name and cmdArgs[0]. */
        string failureText(Args...)(string detail, Args detailArgs)
        {
            return format("[testTsvSample] %s: " ~ detail, cmdArgs[0], detailArgs);
        }

        /* Capture the arguments as text before processArgs gets a chance to modify them. */
        immutable string originalArgs = to!string(cmdArgs);

        TsvSampleOptions cmdopt;
        auto parseResult = cmdopt.processArgs(cmdArgs);
        assert(parseResult[0], failureText("Invalid command lines arg: '%s'.", originalArgs));

        /* Output is collected in memory rather than written to stdout. */
        auto output = appender!(char[])();
        tsvSample(cmdopt, output);    // This invokes the main code line.

        auto expectedOutput = expected.tsvDataToString;

        assert(output.data == expectedOutput,
               failureText(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, output.data.to!string));
    }
}
1524 
1525 unittest
1526 {
1527     import std.path : buildPath;
1528     import std.file : rmdirRecurse;
1529     import std.format : format;
1530 
1531     auto testDir = makeUnittestTempDir("tsv_sample");
1532     scope(exit) testDir.rmdirRecurse;
1533 
1534     /* Tabular data sets and expected results use the built-in static seed.
1535      * Tests are run by writing the data set to a file, then calling the main
1536      * routine to process. The function testTsvSample plays the role of the
1537      * main program. Rather than writing to expected output, the results are
1538      * matched against expected. The expected results were verified by hand
1539      * prior to inclusion in the test.
1540      *
1541      * The initial part of this section is simply setting up data files and
1542      * expected results.
1543      *
1544      * Expected results naming conventions:
1545      *  - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected
1546      *  - Sampling Type (required): Permute, Replace, Bernoulli, Distinct
1547      *  - Compatibility: Compat, AlgoR, Skip, Swap
1548      *  - Weight Field: Wt<num>, e.g. Wt3
1549      *  - Sample Size: Num<num>, eg. Num3
1550      *  - Seed Value: V<num>, eg. V77
1551      *  - Key Field: K<num>, e.g. K2
1552      *  - Probability: P<num>, e.g P05 (5%)
1553      *  - Printing Probabilities: Probs
1554      *  - Printing Probs in order: ProbsInorder
1555      *  - Printing Probs with custom header: RVCustom
1556      */
1557 
1558     /* Empty file. */
1559     string[][] dataEmpty = [];
1560     string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
1561     writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
1562 
1563     /* 3x0, header only. */
1564     string[][] data3x0 = [["field_a", "field_b", "field_c"]];
1565     string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
1566     writeUnittestTsvFile(fpath_data3x0, data3x0);
1567 
1568     /* 3x1 */
1569     string[][] data3x1 =
1570         [["field_a", "field_b", "field_c"],
1571          ["tan", "タン", "8.5"]];
1572 
1573     string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
1574     string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
1575     writeUnittestTsvFile(fpath_data3x1, data3x1);
1576     writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]);
1577 
1578     string[][] data3x1ExpectedReplaceNum3 =
1579         [["field_a", "field_b", "field_c"],
1580          ["tan", "タン", "8.5"],
1581          ["tan", "タン", "8.5"],
1582          ["tan", "タン", "8.5"]];
1583 
1584     /* 3x2 */
1585     string[][] data3x2 =
1586         [["field_a", "field_b", "field_c"],
1587          ["brown", "褐色", "29.2"],
1588          ["gray", "グレー", "6.2"]];
1589 
1590     string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
1591     string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
1592     writeUnittestTsvFile(fpath_data3x2, data3x2);
1593     writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]);
1594 
1595     string[][] data3x2PermuteCompat =
1596         [["field_a", "field_b", "field_c"],
1597          ["gray", "グレー", "6.2"],
1598          ["brown", "褐色", "29.2"]];
1599 
1600     string[][] data3x2PermuteShuffle =
1601         [["field_a", "field_b", "field_c"],
1602          ["gray", "グレー", "6.2"],
1603          ["brown", "褐色", "29.2"]];
1604 
1605     /* 3x3 */
1606     string[][] data3x3 =
1607         [["field_a", "field_b", "field_c"],
1608          ["orange", "オレンジ", "2.5"],
1609          ["pink", "ピンク", "1.1"],
1610          ["purple", "紫の", "42"]];
1611 
1612     string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
1613     string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
1614     writeUnittestTsvFile(fpath_data3x3, data3x3);
1615     writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]);
1616 
1617     string[][] data3x3ExpectedPermuteCompat =
1618         [["field_a", "field_b", "field_c"],
1619          ["purple", "紫の", "42"],
1620          ["pink", "ピンク", "1.1"],
1621          ["orange", "オレンジ", "2.5"]];
1622 
1623     string[][] data3x3ExpectedPermuteSwap =
1624         [["field_a", "field_b", "field_c"],
1625          ["purple", "紫の", "42"],
1626          ["orange", "オレンジ", "2.5"],
1627          ["pink", "ピンク", "1.1"]];
1628 
1629     /* 3x6 */
1630     string[][] data3x6 =
1631         [["field_a", "field_b", "field_c"],
1632          ["red", "赤", "23.8"],
1633          ["green", "緑", "0.0072"],
1634          ["white", "白", "1.65"],
1635          ["yellow", "黄", "12"],
1636          ["blue", "青", "12"],
1637          ["black", "黒", "0.983"]];
1638     string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
1639     string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
1640     writeUnittestTsvFile(fpath_data3x6, data3x6);
1641     writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]);
1642 
1643     // Randomization, all lines
1644     string[][] data3x6ExpectedPermuteCompat =
1645         [["field_a", "field_b", "field_c"],
1646          ["yellow", "黄", "12"],
1647          ["black", "黒", "0.983"],
1648          ["blue", "青", "12"],
1649          ["white", "白", "1.65"],
1650          ["green", "緑", "0.0072"],
1651          ["red", "赤", "23.8"]];
1652 
1653     string[][] data3x6ExpectedPermuteSwap =
1654         [["field_a", "field_b", "field_c"],
1655          ["black", "黒", "0.983"],
1656          ["green", "緑", "0.0072"],
1657          ["red", "赤", "23.8"],
1658          ["yellow", "黄", "12"],
1659          ["white", "白", "1.65"],
1660          ["blue", "青", "12"]];
1661 
1662     string[][] data3x6ExpectedPermuteCompatProbs =
1663         [["random_value", "field_a", "field_b", "field_c"],
1664          ["0.96055546286515892", "yellow", "黄", "12"],
1665          ["0.7571015392895788", "black", "黒", "0.983"],
1666          ["0.52525980887003243", "blue", "青", "12"],
1667          ["0.49287854949943721", "white", "白", "1.65"],
1668          ["0.15929344086907804", "green", "緑", "0.0072"],
1669          ["0.010968807619065046", "red", "赤", "23.8"]];
1670 
1671     /* Note: data3x6ExpectedPermuteAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because
1672      * both are effectively the same algorithm given that --num is data length. Both read
1673      * in the full data in order then call randomShuffle.
1674      */
1675     string[][] data3x6ExpectedPermuteAlgoRNum6 =
1676         [["field_a", "field_b", "field_c"],
1677          ["black", "黒", "0.983"],
1678          ["green", "緑", "0.0072"],
1679          ["red", "赤", "23.8"],
1680          ["yellow", "黄", "12"],
1681          ["white", "白", "1.65"],
1682          ["blue", "青", "12"]];
1683 
1684     string[][] data3x6ExpectedPermuteAlgoRNum5 =
1685         [["field_a", "field_b", "field_c"],
1686          ["red", "赤", "23.8"],
1687          ["black", "黒", "0.983"],
1688          ["white", "白", "1.65"],
1689          ["green", "緑", "0.0072"],
1690          ["yellow", "黄", "12"]];
1691 
1692     string[][] data3x6ExpectedPermuteAlgoRNum4 =
1693         [["field_a", "field_b", "field_c"],
1694          ["blue", "青", "12"],
1695          ["green", "緑", "0.0072"],
1696          ["black", "黒", "0.983"],
1697          ["white", "白", "1.65"]];
1698 
1699     string[][] data3x6ExpectedPermuteAlgoRNum3 =
1700         [["field_a", "field_b", "field_c"],
1701          ["red", "赤", "23.8"],
1702          ["black", "黒", "0.983"],
1703          ["green", "緑", "0.0072"]];
1704 
1705     string[][] data3x6ExpectedPermuteAlgoRNum2 =
1706         [["field_a", "field_b", "field_c"],
1707          ["black", "黒", "0.983"],
1708          ["red", "赤", "23.8"]];
1709 
1710     string[][] data3x6ExpectedPermuteAlgoRNum1 =
1711         [["field_a", "field_b", "field_c"],
1712          ["green", "緑", "0.0072"]];
1713 
1714     string[][] data3x6ExpectedBernoulliProbsP100 =
1715         [["random_value", "field_a", "field_b", "field_c"],
1716          ["0.010968807619065046", "red", "赤", "23.8"],
1717          ["0.15929344086907804", "green", "緑", "0.0072"],
1718          ["0.49287854949943721", "white", "白", "1.65"],
1719          ["0.96055546286515892", "yellow", "黄", "12"],
1720          ["0.52525980887003243", "blue", "青", "12"],
1721          ["0.7571015392895788", "black", "黒", "0.983"]];
1722 
1723     string[][] data3x6ExpectedBernoulliCompatProbsP60 =
1724         [["random_value", "field_a", "field_b", "field_c"],
1725          ["0.010968807619065046", "red", "赤", "23.8"],
1726          ["0.15929344086907804", "green", "緑", "0.0072"],
1727          ["0.49287854949943721", "white", "白", "1.65"],
1728          ["0.52525980887003243", "blue", "青", "12"]];
1729 
1730     string[][] data3x6ExpectedBernoulliSkipP40 =
1731         [["field_a", "field_b", "field_c"],
1732          ["red", "赤", "23.8"],
1733          ["green", "緑", "0.0072"],
1734          ["yellow", "黄", "12"]];
1735 
1736     string[][] data3x6ExpectedBernoulliCompatP60 =
1737         [["field_a", "field_b", "field_c"],
1738          ["red", "赤", "23.8"],
1739          ["green", "緑", "0.0072"],
1740          ["white", "白", "1.65"],
1741          ["blue", "青", "12"]];
1742 
1743     string[][] data3x6ExpectedDistinctK1K3P60 =
1744         [["field_a", "field_b", "field_c"],
1745          ["green", "緑", "0.0072"],
1746          ["white", "白", "1.65"],
1747          ["blue", "青", "12"]];
1748 
1749     string[][] data3x6ExpectedDistinctK1K3P60Probs =
1750         [["random_value", "field_a", "field_b", "field_c"],
1751          ["0", "green", "緑", "0.0072"],
1752          ["0", "white", "白", "1.65"],
1753          ["0", "blue", "青", "12"]];
1754 
1755     string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom =
1756         [["custom_random_value_header", "field_a", "field_b", "field_c"],
1757          ["0", "green", "緑", "0.0072"],
1758          ["0", "white", "白", "1.65"],
1759          ["0", "blue", "青", "12"]];
1760 
1761     string[][] data3x6ExpectedDistinctK2P2ProbsInorder =
1762         [["random_value", "field_a", "field_b", "field_c"],
1763          ["1", "red", "赤", "23.8"],
1764          ["0", "green", "緑", "0.0072"],
1765          ["0", "white", "白", "1.65"],
1766          ["1", "yellow", "黄", "12"],
1767          ["3", "blue", "青", "12"],
1768          ["2", "black", "黒", "0.983"]];
1769 
1770     string[][] data3x6ExpectedPermuteWt3Probs =
1771         [["random_value", "field_a", "field_b", "field_c"],
1772          ["0.9966519875764539", "yellow", "黄", "12"],
1773          ["0.94775884809836686", "blue", "青", "12"],
1774          ["0.82728234682286661", "red", "赤", "23.8"],
1775          ["0.75346697377181959", "black", "黒", "0.983"],
1776          ["0.65130103496422487", "white", "白", "1.65"],
1777          ["1.5636943712879866e-111", "green", "緑", "0.0072"]];
1778 
1779     string[][] data3x6ExpectedWt3ProbsInorder =
1780         [["random_value", "field_a", "field_b", "field_c"],
1781          ["0.82728234682286661", "red", "赤", "23.8"],
1782          ["1.5636943712879866e-111", "green", "緑", "0.0072"],
1783          ["0.65130103496422487", "white", "白", "1.65"],
1784          ["0.9966519875764539", "yellow", "黄", "12"],
1785          ["0.94775884809836686", "blue", "青", "12"],
1786          ["0.75346697377181959", "black", "黒", "0.983"]];
1787 
1788     string[][] data3x6ExpectedPermuteWt3 =
1789         [["field_a", "field_b", "field_c"],
1790          ["yellow", "黄", "12"],
1791          ["blue", "青", "12"],
1792          ["red", "赤", "23.8"],
1793          ["black", "黒", "0.983"],
1794          ["white", "白", "1.65"],
1795          ["green", "緑", "0.0072"]];
1796 
1797     string[][] data3x6ExpectedReplaceNum10 =
1798         [["field_a", "field_b", "field_c"],
1799          ["black", "黒", "0.983"],
1800          ["green", "緑", "0.0072"],
1801          ["green", "緑", "0.0072"],
1802          ["red", "赤", "23.8"],
1803          ["yellow", "黄", "12"],
1804          ["red", "赤", "23.8"],
1805          ["white", "白", "1.65"],
1806          ["yellow", "黄", "12"],
1807          ["yellow", "黄", "12"],
1808          ["white", "白", "1.65"],
1809         ];
1810 
1811     string[][] data3x6ExpectedReplaceNum10V77 =
1812         [["field_a", "field_b", "field_c"],
1813          ["black", "黒", "0.983"],
1814          ["red", "赤", "23.8"],
1815          ["black", "黒", "0.983"],
1816          ["yellow", "黄", "12"],
1817          ["green", "緑", "0.0072"],
1818          ["green", "緑", "0.0072"],
1819          ["green", "緑", "0.0072"],
1820          ["yellow", "黄", "12"],
1821          ["blue", "青", "12"],
1822          ["white", "白", "1.65"],
1823         ];
1824 
1825     /* Using a different static seed. */
1826     string[][] data3x6ExpectedPermuteCompatV41Probs =
1827         [["random_value", "field_a", "field_b", "field_c"],
1828          ["0.68057272653095424", "green", "緑", "0.0072"],
1829          ["0.67681624367833138", "blue", "青", "12"],
1830          ["0.32097338931635022", "yellow", "黄", "12"],
1831          ["0.25092361867427826", "red", "赤", "23.8"],
1832          ["0.15535934292711318", "black", "黒", "0.983"],
1833          ["0.04609582107514143", "white", "白", "1.65"]];
1834 
1835     string[][] data3x6ExpectedBernoulliCompatP60V41Probs =
1836         [["random_value", "field_a", "field_b", "field_c"],
1837          ["0.25092361867427826", "red", "赤", "23.8"],
1838          ["0.04609582107514143", "white", "白", "1.65"],
1839          ["0.32097338931635022", "yellow", "黄", "12"],
1840          ["0.15535934292711318", "black", "黒", "0.983"]];
1841 
1842     string[][] data3x6ExpectedPermuteWt3V41Probs =
1843         [["random_value", "field_a", "field_b", "field_c"],
1844          ["0.96799377498910666", "blue", "青", "12"],
1845          ["0.94356245792573568", "red", "赤", "23.8"],
1846          ["0.90964601024271996", "yellow", "黄", "12"],
1847          ["0.15491658409260103", "white", "白", "1.65"],
1848          ["0.15043620392537033", "black", "黒", "0.983"],
1849          ["6.1394674830701461e-24", "green", "緑", "0.0072"]];
1850 
1851     string[][] data3x6ExpectedWt3V41ProbsInorder =
1852         [["random_value", "field_a", "field_b", "field_c"],
1853          ["0.94356245792573568", "red", "赤", "23.8"],
1854          ["6.1394674830701461e-24", "green", "緑", "0.0072"],
1855          ["0.15491658409260103", "white", "白", "1.65"],
1856          ["0.90964601024271996", "yellow", "黄", "12"],
1857          ["0.96799377498910666", "blue", "青", "12"],
1858          ["0.15043620392537033", "black", "黒", "0.983"]];
1859 
1860 
1861     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
1862     string[][] combo1ExpectedPermuteCompat =
1863         [["field_a", "field_b", "field_c"],
1864          ["yellow", "黄", "12"],
1865          ["tan", "タン", "8.5"],
1866          ["brown", "褐色", "29.2"],
1867          ["green", "緑", "0.0072"],
1868          ["red", "赤", "23.8"],
1869          ["purple", "紫の", "42"],
1870          ["black", "黒", "0.983"],
1871          ["white", "白", "1.65"],
1872          ["gray", "グレー", "6.2"],
1873          ["blue", "青", "12"],
1874          ["pink", "ピンク", "1.1"],
1875          ["orange", "オレンジ", "2.5"]];
1876 
1877     string[][] combo1ExpectedPermuteCompatProbs =
1878         [["random_value", "field_a", "field_b", "field_c"],
1879          ["0.97088520275428891", "yellow", "黄", "12"],
1880          ["0.96055546286515892", "tan", "タン", "8.5"],
1881          ["0.81756894313730299", "brown", "褐色", "29.2"],
1882          ["0.7571015392895788", "green", "緑", "0.0072"],
1883          ["0.52525980887003243", "red", "赤", "23.8"],
1884          ["0.49287854949943721", "purple", "紫の", "42"],
1885          ["0.47081507067196071", "black", "黒", "0.983"],
1886          ["0.38388182921335101", "white", "白", "1.65"],
1887          ["0.29215990612283349", "gray", "グレー", "6.2"],
1888          ["0.24033216014504433", "blue", "青", "12"],
1889          ["0.15929344086907804", "pink", "ピンク", "1.1"],
1890          ["0.010968807619065046", "orange", "オレンジ", "2.5"]];
1891 
1892     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
1893     string[][] combo1ExpectedProbsInorder =
1894         [["random_value", "field_a", "field_b", "field_c"],
1895          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
1896          ["0.15929344086907804", "pink", "ピンク", "1.1"],
1897          ["0.49287854949943721", "purple", "紫の", "42"],
1898          ["0.96055546286515892", "tan", "タン", "8.5"],
1899          ["0.52525980887003243", "red", "赤", "23.8"],
1900          ["0.7571015392895788", "green", "緑", "0.0072"],
1901          ["0.38388182921335101", "white", "白", "1.65"],
1902          ["0.97088520275428891", "yellow", "黄", "12"],
1903          ["0.24033216014504433", "blue", "青", "12"],
1904          ["0.47081507067196071", "black", "黒", "0.983"],
1905          ["0.81756894313730299", "brown", "褐色", "29.2"],
1906          ["0.29215990612283349", "gray", "グレー", "6.2"]];
1907 
1908     string[][] combo1ExpectedBernoulliCompatP50Probs =
1909         [["random_value", "field_a", "field_b", "field_c"],
1910          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
1911          ["0.15929344086907804", "pink", "ピンク", "1.1"],
1912          ["0.49287854949943721", "purple", "紫の", "42"],
1913          ["0.38388182921335101", "white", "白", "1.65"],
1914          ["0.24033216014504433", "blue", "青", "12"],
1915          ["0.47081507067196071", "black", "黒", "0.983"],
1916          ["0.29215990612283349", "gray", "グレー", "6.2"]];
1917 
1918     string[][] combo1ExpectedBernoulliCompatP40 =
1919         [["field_a", "field_b", "field_c"],
1920          ["orange", "オレンジ", "2.5"],
1921          ["pink", "ピンク", "1.1"],
1922          ["white", "白", "1.65"],
1923          ["blue", "青", "12"],
1924          ["gray", "グレー", "6.2"]];
1925 
1926     string[][] combo1ExpectedDistinctK1P40 =
1927         [["field_a", "field_b", "field_c"],
1928          ["orange", "オレンジ", "2.5"],
1929          ["red", "赤", "23.8"],
1930          ["green", "緑", "0.0072"],
1931          ["blue", "青", "12"],
1932          ["black", "黒", "0.983"]];
1933 
1934     string[][] combo1ExpectedPermuteWt3Probs =
1935         [["random_value", "field_a", "field_b", "field_c"],
1936          ["0.99754077523718754", "yellow", "黄", "12"],
1937          ["0.99527665440088786", "tan", "タン", "8.5"],
1938          ["0.99312578945741659", "brown", "褐色", "29.2"],
1939          ["0.98329602553389361", "purple", "紫の", "42"],
1940          ["0.9733096193808366", "red", "赤", "23.8"],
1941          ["0.88797551521739648", "blue", "青", "12"],
1942          ["0.81999230489041786", "gray", "グレー", "6.2"],
1943          ["0.55975569204250941", "white", "白", "1.65"],
1944          ["0.46472135609205739", "black", "黒", "0.983"],
1945          ["0.18824582704191337", "pink", "ピンク", "1.1"],
1946          ["0.1644613185329992", "orange", "オレンジ", "2.5"],
1947          ["1.6438086931020549e-17", "green", "緑", "0.0072"]];
1948 
1949     string[][] combo1ExpectedPermuteWt3 =
1950         [["field_a", "field_b", "field_c"],
1951          ["yellow", "黄", "12"],
1952          ["tan", "タン", "8.5"],
1953          ["brown", "褐色", "29.2"],
1954          ["purple", "紫の", "42"],
1955          ["red", "赤", "23.8"],
1956          ["blue", "青", "12"],
1957          ["gray", "グレー", "6.2"],
1958          ["white", "白", "1.65"],
1959          ["black", "黒", "0.983"],
1960          ["pink", "ピンク", "1.1"],
1961          ["orange", "オレンジ", "2.5"],
1962          ["green", "緑", "0.0072"]];
1963 
1964         string[][] combo1ExpectedPermuteAlgoRNum4 =
1965         [["field_a", "field_b", "field_c"],
1966          ["blue", "青", "12"],
1967          ["gray", "グレー", "6.2"],
1968          ["brown", "褐色", "29.2"],
1969          ["white", "白", "1.65"]];
1970 
1971     string[][] combo1ExpectedReplaceNum10 =
1972         [["field_a", "field_b", "field_c"],
1973          ["gray", "グレー", "6.2"],
1974          ["yellow", "黄", "12"],
1975          ["yellow", "黄", "12"],
1976          ["white", "白", "1.65"],
1977          ["tan", "タン", "8.5"],
1978          ["white", "白", "1.65"],
1979          ["blue", "青", "12"],
1980          ["black", "黒", "0.983"],
1981          ["tan", "タン", "8.5"],
1982          ["purple", "紫の", "42"]];
1983 
1984     /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. */
1985     string[][] data1x200 =
1986         [["field_a"],
1987          ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"],
1988          ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"],
1989          ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"],
1990          ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"],
1991          ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"],
1992          ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"],
1993          ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"],
1994          ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"],
1995          ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"],
1996          ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"],
1997          ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"],
1998          ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"],
1999          ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"],
2000          ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"],
2001          ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"],
2002          ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"],
2003          ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"],
2004          ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"],
2005          ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"],
2006          ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"],
2007         ];
2008 
2009     string fpath_data1x200 = buildPath(testDir, "data1x200.tsv");
2010     string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv");
2011     writeUnittestTsvFile(fpath_data1x200, data1x200);
2012     writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1..$]);
2013 
2014     string[][] data1x200ExpectedBernoulliSkipV333P01 =
2015         [["field_a"],
2016          ["077"],
2017          ["119"]];
2018 
2019     string[][] data1x200ExpectedBernoulliSkipV333P02 =
2020         [["field_a"],
2021          ["038"],
2022          ["059"],
2023          ["124"],
2024          ["161"],
2025          ["162"],
2026          ["183"]];
2027 
2028     string[][] data1x200ExpectedBernoulliSkipV333P03 =
2029         [["field_a"],
2030          ["025"],
2031          ["039"],
2032          ["082"],
2033          ["107"],
2034          ["108"],
2035          ["122"],
2036          ["136"],
2037          ["166"],
2038          ["182"]];
2039 
2040     string[][] data1x200ExpectedBernoulliCompatV333P01 =
2041         [["field_a"],
2042          ["072"]];
2043 
2044     string[][] data1x200ExpectedBernoulliCompatV333P02 =
2045         [["field_a"],
2046          ["004"],
2047          ["072"]];
2048 
2049     string[][] data1x200ExpectedBernoulliCompatV333P03 =
2050         [["field_a"],
2051          ["004"],
2052          ["072"],
2053          ["181"]];
2054 
2055     /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files,
2056      * only expected results. The header is from 3x0, the results are offset 1-position
2057      * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line.
2058      */
2059     string[][] combo2ExpectedBernoulliSkipV333P03 =
2060         [["field_a", "field_b", "field_c"],
2061          ["024"],
2062          ["038"],
2063          ["081"],
2064          ["106"],
2065          ["107"],
2066          ["121"],
2067          ["135"],
2068          ["165"],
2069          ["181"]];
2070 
2071 
2072     /* 1x10 - Simple 1-column file. */
2073     string[][] data1x10 =
2074         [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
2075     string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
2076     string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
2077     writeUnittestTsvFile(fpath_data1x10, data1x10);
2078     writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]);
2079 
2080     string[][] data1x10ExpectedPermuteCompat =
2081         [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
2082 
2083     string[][] data1x10ExpectedPermuteWt1 =
2084         [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]];
2085 
2086     /* 2x10a - Uniform distribution [0,1]. */
2087     string[][] data2x10a =
2088         [["line", "weight"],
2089          ["1", "0.26788837"],
2090          ["2", "0.06601298"],
2091          ["3", "0.38627527"],
2092          ["4", "0.47379424"],
2093          ["5", "0.02966641"],
2094          ["6", "0.05636231"],
2095          ["7", "0.70529242"],
2096          ["8", "0.91836862"],
2097          ["9", "0.99103720"],
2098          ["10", "0.31401740"]];
2099 
2100     string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv");
2101     writeUnittestTsvFile(fpath_data2x10a, data2x10a);
2102 
2103     string[][] data2x10aExpectedPermuteWt2Probs =
2104         [["random_value", "line", "weight"],
2105          ["0.96833865494543658", "8", "0.91836862"],
2106          ["0.91856842054413923", "4", "0.47379424"],
2107          ["0.25730832087795091", "7", "0.70529242"],
2108          ["0.2372531790701812", "9", "0.99103720"],
2109          ["0.16016096701872204", "3", "0.38627527"],
2110          ["0.090819662667243381", "10", "0.31401740"],
2111          ["0.0071764539244361172", "6", "0.05636231"],
2112          ["4.8318642951630057e-08", "1", "0.26788837"],
2113          ["3.7525692966535517e-10", "5", "0.02966641"],
2114          ["8.2123247880095796e-13", "2", "0.06601298"]];
2115 
2116     /* 2x10b - Uniform distribution [0,1000]. */
2117     string[][] data2x10b =
2118         [["line", "weight"],
2119          ["1", "761"],
2120          ["2", "432"],
2121          ["3", "103"],
2122          ["4", "448"],
2123          ["5", "750"],
2124          ["6", "711"],
2125          ["7", "867"],
2126          ["8", "841"],
2127          ["9", "963"],
2128          ["10", "784"]];
2129 
2130     string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv");
2131     writeUnittestTsvFile(fpath_data2x10b, data2x10b);
2132 
2133     string[][] data2x10bExpectedPermuteWt2Probs =
2134         [["random_value", "line", "weight"],
2135          ["0.99996486739067969", "8", "841"],
2136          ["0.99991017467137211", "4", "448"],
2137          ["0.99960871524873662", "6", "711"],
2138          ["0.999141885371438", "5", "750"],
2139          ["0.99903963250274785", "10", "784"],
2140          ["0.99889631825931946", "7", "867"],
2141          ["0.99852058315191139", "9", "963"],
2142          ["0.99575669679158918", "2", "432"],
2143          ["0.99408758732050595", "1", "761"],
2144          ["0.99315467761212362", "3", "103"]];
2145 
2146     /* 2x10c - Logarithmic distribution in random order. */
2147     string[][] data2x10c =
2148         [["line", "weight"],
2149          ["1", "31.85"],
2150          ["2", "17403.31"],
2151          ["3", "653.84"],
2152          ["4", "8.23"],
2153          ["5", "2671.04"],
2154          ["6", "26226.08"],
2155          ["7", "1.79"],
2156          ["8", "354.56"],
2157          ["9", "35213.81"],
2158          ["10", "679.29"]];
2159 
2160     string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv");
2161     writeUnittestTsvFile(fpath_data2x10c, data2x10c);
2162 
2163     string[][] data2x10cExpectedPermuteWt2Probs =
2164         [["random_value", "line", "weight"],
2165          ["0.99998939008709697", "6", "26226.08"],
2166          ["0.99995951291695517", "9", "35213.81"],
2167          ["0.99991666907613541", "8", "354.56"],
2168          ["0.9998944505218641", "2", "17403.31"],
2169          ["0.9997589760286163", "5", "2671.04"],
2170          ["0.99891852769877643", "3", "653.84"],
2171          ["0.99889167752782515", "10", "679.29"],
2172          ["0.99512207506850148", "4", "8.23"],
2173          ["0.86789371584259023", "1", "31.85"],
2174          ["0.5857443816291561", "7", "1.79"]];
2175 
2176     /* 2x10d - Logarithmic distribution in ascending order. */
2177     string[][] data2x10d =
2178         [["line", "weight"],
2179          ["1", "1.79"],
2180          ["2", "8.23"],
2181          ["3", "31.85"],
2182          ["4", "354.56"],
2183          ["5", "653.84"],
2184          ["6", "679.29"],
2185          ["7", "2671.04"],
2186          ["8", "17403.31"],
2187          ["9", "26226.08"],
2188          ["10", "35213.81"]];
2189 
2190     string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv");
2191     writeUnittestTsvFile(fpath_data2x10d, data2x10d);
2192 
2193     string[][] data2x10dExpectedPermuteWt2Probs =
2194         [["random_value", "line", "weight"],
2195          ["0.99999830221846353", "8", "17403.31"],
2196          ["0.99997860834041397", "10", "35213.81"],
2197          ["0.99994563828986716", "9", "26226.08"],
2198          ["0.99988650363575737", "4", "354.56"],
2199          ["0.99964161939190088", "7", "2671.04"],
2200          ["0.99959045338948649", "6", "679.29"],
2201          ["0.99901574490639788", "5", "653.84"],
2202          ["0.97803163304747431", "3", "31.85"],
2203          ["0.79994791806910948", "2", "8.23"],
2204          ["0.080374261239949119", "1", "1.79"]];
2205 
2206     /* 2x10e - Logarithmic distribution in descending order. */
2207     string[][] data2x10e =
2208         [["line", "weight"],
2209          ["1", "35213.81"],
2210          ["2", "26226.08"],
2211          ["3", "17403.31"],
2212          ["4", "2671.04"],
2213          ["5", "679.29"],
2214          ["6", "653.84"],
2215          ["7", "354.56"],
2216          ["8", "31.85"],
2217          ["9", "8.23"],
2218          ["10", "1.79"]];
2219     string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
2220     writeUnittestTsvFile(fpath_data2x10e, data2x10e);
2221 
2222     string[][] data2x10eExpectedPermuteWt2Probs =
2223         [["random_value", "line", "weight"],
2224          ["0.99998493348975237", "4", "2671.04"],
2225          ["0.99995934807202624", "3", "17403.31"],
2226          ["0.99992995739727453", "2", "26226.08"],
2227          ["0.99987185679245649", "1", "35213.81"],
2228          ["0.99957451563173938", "6", "653.84"],
2229          ["0.99907273650209583", "8", "31.85"],
2230          ["0.99905260312968946", "5", "679.29"],
2231          ["0.99730333650516401", "7", "354.56"],
2232          ["0.84093902435227808", "9", "8.23"],
2233          ["0.65650015926290028", "10", "1.79"]];
2234 
2235     /* Data sets for distinct sampling. */
2236     string[][] data5x25 =
2237         [["ID", "Shape", "Color", "Size", "Weight"],
2238          ["01", "circle", "red", "S", "10"],
2239          ["02", "circle", "black", "L", "20"],
2240          ["03", "square", "black", "L", "20"],
2241          ["04", "circle", "green", "L", "30"],
2242          ["05", "ellipse", "red", "S", "20"],
2243          ["06", "triangle", "red", "S", "10"],
2244          ["07", "triangle", "red", "L", "20"],
2245          ["08", "square", "black", "S", "10"],
2246          ["09", "circle", "black", "S", "20"],
2247          ["10", "square", "green", "L", "20"],
2248          ["11", "triangle", "red", "L", "20"],
2249          ["12", "circle", "green", "L", "30"],
2250          ["13", "ellipse", "red", "S", "20"],
2251          ["14", "circle", "green", "L", "30"],
2252          ["15", "ellipse", "red", "L", "30"],
2253          ["16", "square", "red", "S", "10"],
2254          ["17", "circle", "black", "L", "20"],
2255          ["18", "square", "red", "S", "20"],
2256          ["19", "square", "black", "L", "20"],
2257          ["20", "circle", "red", "S", "10"],
2258          ["21", "ellipse", "black", "L", "30"],
2259          ["22", "triangle", "red", "L", "30"],
2260          ["23", "circle", "green", "S", "20"],
2261          ["24", "square", "green", "L", "20"],
2262          ["25", "circle", "red", "S", "10"],
2263         ];
2264 
2265     string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
2266     string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
2267     writeUnittestTsvFile(fpath_data5x25, data5x25);
2268     writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]);
2269 
2270     string[][] data5x25ExpectedDistinctK2P40 =
2271         [["ID", "Shape", "Color", "Size", "Weight"],
2272          ["03", "square", "black", "L", "20"],
2273          ["05", "ellipse", "red", "S", "20"],
2274          ["08", "square", "black", "S", "10"],
2275          ["10", "square", "green", "L", "20"],
2276          ["13", "ellipse", "red", "S", "20"],
2277          ["15", "ellipse", "red", "L", "30"],
2278          ["16", "square", "red", "S", "10"],
2279          ["18", "square", "red", "S", "20"],
2280          ["19", "square", "black", "L", "20"],
2281          ["21", "ellipse", "black", "L", "30"],
2282          ["24", "square", "green", "L", "20"],
2283         ];
2284 
2285     string[][] data5x25ExpectedDistinctK2K4P20 =
2286         [["ID", "Shape", "Color", "Size", "Weight"],
2287          ["03", "square", "black", "L", "20"],
2288          ["07", "triangle", "red", "L", "20"],
2289          ["08", "square", "black", "S", "10"],
2290          ["10", "square", "green", "L", "20"],
2291          ["11", "triangle", "red", "L", "20"],
2292          ["16", "square", "red", "S", "10"],
2293          ["18", "square", "red", "S", "20"],
2294          ["19", "square", "black", "L", "20"],
2295          ["22", "triangle", "red", "L", "30"],
2296          ["24", "square", "green", "L", "20"],
2297         ];
2298 
2299     string[][] data5x25ExpectedDistinctK2K3K4P20 =
2300         [["ID", "Shape", "Color", "Size", "Weight"],
2301          ["04", "circle", "green", "L", "30"],
2302          ["07", "triangle", "red", "L", "20"],
2303          ["09", "circle", "black", "S", "20"],
2304          ["11", "triangle", "red", "L", "20"],
2305          ["12", "circle", "green", "L", "30"],
2306          ["14", "circle", "green", "L", "30"],
2307          ["16", "square", "red", "S", "10"],
2308          ["18", "square", "red", "S", "20"],
2309          ["22", "triangle", "red", "L", "30"],
2310         ];
2311 
2312     /* Fields 2 and 4 from data5x25. Distinct rows should be the same for equivalent keys. */
2313     string[][] data2x25 =
2314         [["Shape", "Size"],
2315          ["circle", "S"],
2316          ["circle", "L"],
2317          ["square", "L"],
2318          ["circle", "L"],
2319          ["ellipse", "S"],
2320          ["triangle", "S"],
2321          ["triangle", "L"],
2322          ["square", "S"],
2323          ["circle", "S"],
2324          ["square", "L"],
2325          ["triangle", "L"],
2326          ["circle", "L"],
2327          ["ellipse", "S"],
2328          ["circle", "L"],
2329          ["ellipse", "L"],
2330          ["square", "S"],
2331          ["circle", "L"],
2332          ["square", "S"],
2333          ["square", "L"],
2334          ["circle", "S"],
2335          ["ellipse", "L"],
2336          ["triangle", "L"],
2337          ["circle", "S"],
2338          ["square", "L"],
2339          ["circle", "S"],
2340         ];
2341 
2342     string fpath_data2x25 = buildPath(testDir, "data2x25.tsv");
2343     string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv");
2344     writeUnittestTsvFile(fpath_data2x25, data2x25);
2345     writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1..$]);
2346 
2347     string[][] data2x25ExpectedDistinctK1K2P20 =
2348         [["Shape", "Size"],
2349          ["square", "L"],
2350          ["triangle", "L"],
2351          ["square", "S"],
2352          ["square", "L"],
2353          ["triangle", "L"],
2354          ["square", "S"],
2355          ["square", "S"],
2356          ["square", "L"],
2357          ["triangle", "L"],
2358          ["square", "L"],
2359         ];
2360 
2361     string[][] data1x25 =
2362         [["Shape-Size"],
2363          ["circle-S"],
2364          ["circle-L"],
2365          ["square-L"],
2366          ["circle-L"],
2367          ["ellipse-S"],
2368          ["triangle-S"],
2369          ["triangle-L"],
2370          ["square-S"],
2371          ["circle-S"],
2372          ["square-L"],
2373          ["triangle-L"],
2374          ["circle-L"],
2375          ["ellipse-S"],
2376          ["circle-L"],
2377          ["ellipse-L"],
2378          ["square-S"],
2379          ["circle-L"],
2380          ["square-S"],
2381          ["square-L"],
2382          ["circle-S"],
2383          ["ellipse-L"],
2384          ["triangle-L"],
2385          ["circle-S"],
2386          ["square-L"],
2387          ["circle-S"],
2388         ];
2389 
2390     string fpath_data1x25 = buildPath(testDir, "data1x25.tsv");
2391     string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv");
2392     writeUnittestTsvFile(fpath_data1x25, data1x25);
2393     writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1..$]);
2394 
2395     string[][] data1x25ExpectedDistinctK1P20 =
2396         [["Shape-Size"],
2397          ["triangle-L"],
2398          ["square-S"],
2399          ["triangle-L"],
2400          ["ellipse-L"],
2401          ["square-S"],
2402          ["square-S"],
2403          ["ellipse-L"],
2404          ["triangle-L"],
2405         ];
2406 
2407     string[][] data1x25ExpectedDistinctK1P20Probs =
2408         [["random_value", "Shape-Size"],
2409          ["0", "triangle-L"],
2410          ["0", "square-S"],
2411          ["0", "triangle-L"],
2412          ["0", "ellipse-L"],
2413          ["0", "square-S"],
2414          ["0", "square-S"],
2415          ["0", "ellipse-L"],
2416          ["0", "triangle-L"],
2417         ];
2418 
2419     string[][] data1x25ExpectedDistinctK1P20ProbsInorder =
2420         [["random_value", "Shape-Size"],
2421          ["1", "circle-S"],
2422          ["4", "circle-L"],
2423          ["2", "square-L"],
2424          ["4", "circle-L"],
2425          ["2", "ellipse-S"],
2426          ["1", "triangle-S"],
2427          ["0", "triangle-L"],
2428          ["0", "square-S"],
2429          ["1", "circle-S"],
2430          ["2", "square-L"],
2431          ["0", "triangle-L"],
2432          ["4", "circle-L"],
2433          ["2", "ellipse-S"],
2434          ["4", "circle-L"],
2435          ["0", "ellipse-L"],
2436          ["0", "square-S"],
2437          ["4", "circle-L"],
2438          ["0", "square-S"],
2439          ["2", "square-L"],
2440          ["1", "circle-S"],
2441          ["0", "ellipse-L"],
2442          ["0", "triangle-L"],
2443          ["1", "circle-S"],
2444          ["2", "square-L"],
2445          ["1", "circle-S"],
2446         ];
2447 
2448     /*
2449      * Enough setup! Actually run some tests!
2450      */
2451 
2452     /* Permutations. Headers, static seed, compatibility mode. With weights and without. */
2453     testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty);
2454     testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0);
2455     testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1);
2456     testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat);
2457     testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat);
2458     testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat);
2459     testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
2460     testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
2461     testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
2462     testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2463     testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2464     testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
2465     testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
2466 
2467     /* Permutations, without compatibility mode, or with both compatibility and printing. */
2468     testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
2469     testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0);
2470     testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1);
2471     testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle);
2472     testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap);
2473     testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap);
2474     testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
2475     testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
2476     testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
2477 
2478     /* Reservoir sampling using Algorithm R.
2479      * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.)
2480      */
2481     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
2482     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
2483     testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0);
2484     testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0);
2485     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1);
2486     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1);
2487     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6);
2488     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6);
2489     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum5);
2490     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum4);
2491     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum3);
2492     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum2);
2493     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum1);
2494 
2495     /* Bernoulli sampling cases. */
2496     testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty);
2497     testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0);
2498     testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1);
2499     testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6);
2500     testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6);
2501     testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2502     testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60);
2503     testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60);
2504     testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs);
2505 
2506     /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. */
2507     testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01);
2508     testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02);
2509     testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03);
2510     testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01);
2511     testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02);
2512     testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03);
2513     testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40);
2514 
2515     /* Distinct sampling cases. */
2516     testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
2517     testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
2518     testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1);
2519     testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6);
2520     testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
2521 
2522 
2523 
2524     /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling.
2525      * For weighted sampling, use the weighted cases, but with expected results in the original input order.
2526      */
2527     testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2528     testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
2529     testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
2530                   data3x6ExpectedWt3ProbsInorder);
2531     testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
2532                   data3x6ExpectedWt3V41ProbsInorder);
2533     testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6],
2534                   data3x6ExpectedDistinctK1K3P60Probs);
2535     testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header",
2536                    "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom);
2537     testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
2538                   data3x6ExpectedDistinctK2P2ProbsInorder);
2539 
2540     /* Simple random sampling with replacement. */
2541     testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty);
2542     testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty);
2543     testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0);
2544     testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0);
2545     testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3);
2546     testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10);
2547     testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77);
2548 
2549     /* Permutations, compatibility mode, without headers. */
2550     testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1..$]);
2551     testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1..$]);
2552     testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1..$]);
2553     testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..$]);
2554     testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..$]);
2555     testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]);
2556     testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]);
2557     testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]);
2558     testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1..$]);
2559 
2560     /* Permutations, no headers, without compatibility mode, or with printing and compatibility mode. */
2561     testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1..$]);
2562     testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1..$]);
2563     testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1..$]);
2564     testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1..$]);
2565     testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]);
2566     testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]);
2567     testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]);
2568 
2569     /* Reservoir sampling using Algorithm R, no headers. */
2570     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
2571     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
2572     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1..$]);
2573     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1..$]);
2574     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]);
2575     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]);
2576     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum5[1..$]);
2577     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum4[1..$]);
2578     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum3[1..$]);
2579     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum2[1..$]);
2580     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum1[1..$]);
2581 
2582     /* Bernoulli sampling cases, no headers. */
2583     testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1..$]);
2584     testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
2585     testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
2586     testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]);
2587     testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..$]);
2588     testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1..$]);
2589 
2590     /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling, no headers. */
2591     testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1..$]);
2592     testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1..$]);
2593     testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..$]);
2594     testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1..$]);
2595     testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1..$]);
2596     testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1..$]);
2597     testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1..$]);
2598 
2599     /* Distinct sampling cases, no headers. */
2600     testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]);
2601     testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2602     testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2603     testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
2604 
2605     /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */
2606     testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]);
2607     testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1..$]);
2608     testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
2609                   data3x6ExpectedDistinctK1K3P60Probs[1..$]);
2610     testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
2611                   data3x6ExpectedDistinctK2P2ProbsInorder[1..$]);
2612 
2613     /* Simple random sampling with replacement, no headers. */
2614     testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
2615     testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
2616     testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1..$]);
2617     testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1..$]);
2618     testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1..$]);
2619 
2620     /* Multi-file tests. */
2621     testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode",
2622                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2623                   combo1ExpectedPermuteCompat);
2624     testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
2625                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2626                   combo1ExpectedPermuteCompatProbs);
2627     testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
2628                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2629                   combo1ExpectedPermuteWt3Probs);
2630     testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode",
2631                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2632                   combo1ExpectedPermuteWt3);
2633     testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4",
2634                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2635                   combo1ExpectedPermuteAlgoRNum4);
2636 
2637     /* Multi-file, no headers. */
2638     testTsvSample(["test-c6", "--static-seed", "--compatibility-mode",
2639                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2640                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2641                   combo1ExpectedPermuteCompat[1..$]);
2642     testTsvSample(["test-c7", "--static-seed", "--print-random",
2643                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2644                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2645                   combo1ExpectedPermuteCompatProbs[1..$]);
2646     testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3",
2647                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2648                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2649                   combo1ExpectedPermuteWt3Probs[1..$]);
2650     testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode",
2651                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2652                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2653                   combo1ExpectedPermuteWt3[1..$]);
2654     testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4",
2655                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2656                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2657                   combo1ExpectedPermuteAlgoRNum4[1..$]);
2658 
2659     /* Bernoulli sampling cases. */
2660     testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5",
2661                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2662                   combo1ExpectedBernoulliCompatP50Probs);
2663     testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4",
2664                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2665                   combo1ExpectedBernoulliCompatP40);
2666     testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5",
2667                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2668                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2669                   combo1ExpectedBernoulliCompatP50Probs[1..$]);
2670     testTsvSample(["test-c14", "--static-seed", "--prob", ".4",
2671                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2672                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2673                   combo1ExpectedBernoulliCompatP40[1..$]);
2674 
2675     /* Bernoulli sampling with probabilities in skip sampling range. */
2676     testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03",
2677                    fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10],
2678                   combo2ExpectedBernoulliSkipV333P03);
2679     testTsvSample(["test-cc1", "-v", "333", "-p", "0.03",
2680                    fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader],
2681                   combo2ExpectedBernoulliSkipV333P03[1..$]);
2682 
2683     /* Distinct sampling cases. */
2684     testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
2685                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2686                   combo1ExpectedDistinctK1P40);
2687     testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4",
2688                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2689                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2690                   combo1ExpectedDistinctK1P40[1..$]);
2691 
2692     /* Generating random weights. */
2693     testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder",
2694                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2695                   combo1ExpectedProbsInorder);
2696     testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder",
2697                    fpath_data3x3_noheader, fpath_data3x1_noheader,
2698                    fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
2699                   combo1ExpectedProbsInorder[1..$]);
2700 
2701     /* Simple random sampling with replacement. */
2702     testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10",
2703                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
2704                   combo1ExpectedReplaceNum10);
2705 
2706     testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10",
2707                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
2708                    fpath_data3x6_noheader, fpath_data3x2_noheader],
2709                   combo1ExpectedReplaceNum10[1..$]);
2710 
2711     /* Single column file. */
2712     testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
2713     testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
2714 
2715     /* Distributions. */
2716     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
2717     testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs);
2718     testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs);
2719     testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs);
2720     testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs);
2721 
2722     /* Tests of the subset sample size option (--n|num).
2723      *
2724      * Note: Each sample size is checked against a prefix of the same expected output,
2725      * which ensures that subset length does not affect output order.
2726      */
2727     import std.algorithm : min;
2728     for (size_t n = data3x6.length + 2; n >= 1; n--)
2729     {
2730         /* reservoirSamplingViaHeap.
2731          */
2732         size_t expectedLength = min(data3x6.length, n + 1);
2733         testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
2734                        "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
2735 
2736         testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
2737                        "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
2738 
2739         testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
2740                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]);
2741 
2742         testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
2743                        "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]);
2744 
2745         testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
2746                        "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]);
2747 
2748         testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
2749                        fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]);
2750 
2751         testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
2752                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]);
2753 
2754         testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
2755                        "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]);
2756 
2757         testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string,
2758                        "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]);
2759 
2760         /* Bernoulli sampling.
2761          */
2762         import std.algorithm : min;
2763         size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length);
2764 
2765         testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2766                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]);
2767 
2768         testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2769                        "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]);
2770 
2771         testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2772                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]);
2773 
2774         testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
2775                        fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]);
2776 
2777         /* Distinct Sampling.
2778          */
2779         size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length);
2780 
2781         testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
2782                        "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]);
2783 
2784         testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
2785                        fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]);
2786 
2787         testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
2788                        "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]);
2789 
2790         testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
2791                        fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
2792     }
2793 
2794     /* Similar tests with the 1x10 data set. */
2795     for (size_t n = data1x10.length + 2; n >= 1; n--)
2796     {
2797         size_t expectedLength = min(data1x10.length, n + 1);
2798         testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
2799                        "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);
2800 
2801         testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
2802                        "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);
2803 
2804         testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
2805                        fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);
2806 
2807         testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
2808                        "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
2809     }
2810 
2811     /* Simple random sampling with replacement: ensure sample size doesn't change order. */
2812     for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
2813     {
2814         testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
2815                       data3x6ExpectedReplaceNum10[0 .. n + 1]);
2816 
2817         testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
2818                       data3x6ExpectedReplaceNum10[1 .. n + 1]);
2819     }
2820 
2821     /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */
2822     for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
2823     {
2824         size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);
2825 
2826         testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
2827                        "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);
2828 
2829         testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
2830                        fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
2831 }
2832 
2833 
2834     /* Distinct sampling tests. */
2835     testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
2836                   data5x25ExpectedDistinctK2P40);
2837 
2838     testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
2839                   data5x25ExpectedDistinctK2K4P20);
2840 
2841     testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
2842                   data5x25ExpectedDistinctK2K3K4P20);
2843 
2844     testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
2845                   data5x25ExpectedDistinctK2P40[1..$]);
2846 
2847     testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
2848                   data5x25ExpectedDistinctK2K4P20[1..$]);
2849 
2850     testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
2851                   data5x25ExpectedDistinctK2K3K4P20[1..$]);
2852 
2853 
2854     /* These distinct tests check that using the whole line as the key ('-k 0') and
2855      * specifying all fields in order give the same result. Also that field numbers don't
2856      * matter, as '-k 1,2' in data2x25 selects the same keys as '-k 2,4' in data5x25.
2857      */
2858     testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25],
2859                   data2x25ExpectedDistinctK1K2P20);
2860 
2861     testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25],
2862                   data2x25ExpectedDistinctK1K2P20);
2863 
2864     testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader],
2865                   data2x25ExpectedDistinctK1K2P20[1..$]);
2866 
2867     testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader],
2868                   data2x25ExpectedDistinctK1K2P20[1..$]);
2869 
2870     /* Similar to the last set, but for a 1-column file. Also with random value printing. */
2871     testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25],
2872                   data1x25ExpectedDistinctK1P20);
2873 
2874     testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25],
2875                   data1x25ExpectedDistinctK1P20);
2876 
2877     testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader],
2878                   data1x25ExpectedDistinctK1P20[1..$]);
2879 
2880     testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader],
2881                   data1x25ExpectedDistinctK1P20[1..$]);
2882 
2883 
2884     testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25],
2885                   data1x25ExpectedDistinctK1P20Probs);
2886 
2887     testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25],
2888                   data1x25ExpectedDistinctK1P20Probs);
2889 
2890     testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader],
2891                   data1x25ExpectedDistinctK1P20Probs[1..$]);
2892 
2893     testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader],
2894                   data1x25ExpectedDistinctK1P20Probs[1..$]);
2895 
2896 
2897     testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25],
2898                   data1x25ExpectedDistinctK1P20ProbsInorder);
2899 
2900     testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25],
2901                   data1x25ExpectedDistinctK1P20ProbsInorder);
2902 
2903     testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader],
2904                   data1x25ExpectedDistinctK1P20ProbsInorder[1..$]);
2905 
2906     testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader],
2907                   data1x25ExpectedDistinctK1P20ProbsInorder[1..$]);
2908 
2909 }