1 /**
2 Command line tool for shuffling or sampling lines from input streams. Several methods
3 are available, including weighted and unweighted shuffling, simple and weighted random
4 sampling, sampling with replacement, Bernoulli sampling, and distinct sampling.
6 Copyright (c) 2017-2020, eBay Inc.
7 Initially written by Jon Degenhardt
9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 module tsv_utils.tsv_sample;
13 import std.array : appender, Appender, RefAppender;
14 import std.exception : enforce;
15 import std.format : format;
16 import std.range;
17 import std.stdio;
18 import std.typecons : tuple, Flag;
/* Embeds a D runtime option string in the executable via the extern(C) rt_options
 * symbol. The "gcopt=cleanup:none" entry sets the garbage collector's 'cleanup'
 * option to 'none'. The declaration is only compiled in for compiler front-end
 * versions 2.085 and later (the __VERSION__ check).
 * NOTE(review): presumably the intent is to skip the GC's final cleanup pass at
 * program exit for faster shutdown - confirm against the druntime GC documentation.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
version(unittest)
{
    // Unit test builds take their main from the -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line arguments and then hands off to tsvSample, which
     * does the actual sampling. Output is written to standard output through a
     * buffered output range.
     *
     * Returns: The exit code produced by argument processing when it indicates that
     * execution should stop (help, version, or an argument error); 1 if an Exception
     * is thrown while sampling (the message is reported on stderr); 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* Merge coverage reports across runs when built with DMD code coverage. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        const argsResult = cmdopt.processArgs(cmdArgs);
        const bool continueExecution = argsResult[0];
        if (!continueExecution) return argsResult[1];

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        int exitCode = 0;

        try
        {
            import tsv_utils.common.utils : BufferedOutputRange;

            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            exitCode = 1;
        }

        return exitCode;
    }
}
/** Brief help text: a synopsis and a short description of each sampling mode.
 *
 * processArgs passes this text to defaultGetoptPrinter, together with the option
 * descriptions, when getopt reports that help was requested. The text ends with an
 * "Options:" heading.
 */
immutable helpText = q"EOS
Synopsis: tsv-sample [options] [file...]
Sample input lines or randomize their order. Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.
Use '--help-verbose' for detailed information.
Options:
EOS";
/** Detailed help text, printed when '--help-verbose' is given.
 *
 * Contains the same synopsis and mode descriptions as the brief help text, followed
 * by sections on sample size, random seeds, memory use, weighted sampling, random
 * value printing, and compatibility mode. processArgs passes this text to
 * defaultGetoptPrinter together with the option descriptions.
 *
 * Note: Random value printing is enabled by '--print-random', which has no short
 * form. The short option '-p' belongs to '--p|prob'. The text below must refer to
 * the option as '--print-random'.
 */
immutable helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]
Sample input lines or randomize their order. Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.
Sample size: The '--n|num' option controls the sample size for all
sampling methods. In the case of simple and weighted random sampling it
also limits the amount of memory required.
Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)
Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, there is no memory accumulation. These algorithms
can run on arbitrary size inputs. Sampling with replacement reads all
lines into memory and is limited by available memory. Shuffling also reads
all lines into memory and is similarly limited. Random sampling uses
reservoir sampling, and only needs to hold the sample size (--n|num) in
memory. The input data can be of any length.
Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)
Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.
The specifics behind these random values are subject to change in future
releases.
Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.
Options:
EOS";
/** Container for command line options and derived data.
 *
 * TsvSampleOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, TsvSampleOptions is used as a container
 * holding the specific processing options used by the different sampling routines.
 */
struct TsvSampleOptions
{
    import tsv_utils.common.utils : InputSourceRange;

    string programName;                        /// Program name
    InputSourceRange inputSources;             /// Input files
    bool helpVerbose = false;                  /// --help-verbose
    bool hasHeader = false;                    /// --H|header
    ulong sampleSize = 0;                      /// --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
    size_t[] keyFields;                        /// --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    /// --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           /// --r|replace
    bool preserveInputOrder = false;           /// --i|inorder
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    bool printRandom = false;                  /// --print-random
    bool genRandomInorder = false;             /// --gen-random-inorder
    string randomValueHeader = "random_value"; /// --random-value-header
    bool compatibilityMode = false;            /// --compatibility-mode
    char delim = '\t';                         /// --d|delimiter
    bool versionWanted = false;                /// --V|version
    bool preferSkipSampling = false;           /// --prefer-skip-sampling
    bool preferAlgorithmR = false;             /// --prefer-algorithm-r
    bool hasWeightField = false;               /// Derived. True if --w|weight-field was given a non-zero field.
    bool useBernoulliSampling = false;         /// Derived.
    bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
    bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
    uint seed = 0;                             /// Derived from --static-seed, --seed-value

    /** Process tsv-sample command line arguments.
     *
     * Defines the command line options, performs validation, and derives additional
     * state. std.getopt.getopt is called to do the main option processing, followed
     * by additional validation and derivation.
     *
     * Help text is printed to standard output if help was requested. Error text is
     * written to stderr if invalid input is encountered.
     *
     * Params:
     *   cmdArgs = The program's command line arguments. Options are consumed by
     *             getopt; remaining arguments are taken as input file paths and the
     *             array is then truncated to length one.
     *
     * Returns: A tuple. First value is true if command line arguments were
     * successfully processed and execution should continue, or false if an error
     * occurred or the user asked for help. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. Field indices will have been converted to zero-based.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : all, canFind, each;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : inputSourceRange, makeFieldListOptionHandler, ReadHeader;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                "k|key-fields",    "<field-list>  Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "w|weight-field",  "NUM  Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
                "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "i|inorder",       "     Output random samples in original input order. Requires use of --n|num.", &preserveInputOrder,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                /* This option takes a string argument, so the help entry carries a
                 * placeholder like the other value-taking options do.
                 */
                "random-value-header",  "STR  Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
                "compatibility-mode", "     Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,

                "d|delimiter",     "CHR  Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "prefer-skip-sampling", "     (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
                &preferSkipSampling,

                "prefer-algorithm-r",   "     (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
                &preferAlgorithmR,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Input files. Remaining command line args are files. Standard input
             * ("-") is used when no files are listed.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;
            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
            inputSources = inputSourceRange(filepaths, readHeader);

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            if (srsWithReplacement)
            {
                enforce(!hasWeightField,
                        "Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");

                enforce(inclusionProbability.isNaN,
                        "Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");

                enforce(keyFields.length == 0,
                        "Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");

                enforce(!printRandom && !genRandomInorder,
                        "Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");

                enforce(!preserveInputOrder,
                        "Sampling with replacement (--r|replace) does not support input order preservation (--i|inorder option).");
            }

            if (keyFields.length > 0)
            {
                /* Note: useDistinctSampling is set as part of the inclusion probability checks below. */

                enforce(!inclusionProbability.isNaN, "--p|prob is required when using --k|key-fields.");

                if (keyFields.length == 1 && keyFields[0] == 0)
                {
                    distinctKeyIsFullLine = true;
                }
                else
                {
                    enforce(keyFields.length <= 1 || keyFields.all!(x => x != 0),
                            "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");

                    keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
                }
            }

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */
            if (!inclusionProbability.isNaN)
            {
                enforce(inclusionProbability > 0.0 && inclusionProbability <= 1.0,
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));

                if (keyFields.length > 0) useDistinctSampling = true;
                else useBernoulliSampling = true;

                enforce(!hasWeightField, "--w|weight-field and --p|prob cannot be used together.");

                enforce(!genRandomInorder || useDistinctSampling,
                        "--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used." ~
                        "\nUse --gen-random-inorder alone to print probabilities for all lines." ~
                        "\nUse --p|prob and --print-random to print probabilities for lines satisfying the probability threshold.");
            }
            else if (genRandomInorder && !hasWeightField)
            {
                /* Unweighted --gen-random-inorder is handled by the Bernoulli sampling routine. */
                useBernoulliSampling = true;
            }

            /* The message names the option as the user types it (--random-value-header). */
            enforce(randomValueHeader.length != 0 && !randomValueHeader.canFind('\n') &&
                    !randomValueHeader.canFind(delim),
                    "--random-value-header must be at least one character and not contain field delimiters or newlines.");

            /* Check for incompatible use of (--i|inorder) and shuffling of the full
             * data set. Sampling with replacement is also incompatible, this is
             * detected earlier. Shuffling is the default operation, so it is identified
             * by eliminating the other modes of operation.
             */
            enforce(!preserveInputOrder ||
                    sampleSize != 0 ||
                    useBernoulliSampling ||
                    useDistinctSampling,
                    "Preserving input order (--i|inorder) is not compatible with full data set shuffling. Switch to random sampling with a sample size (--n|num) to use --i|inorder.");

            /* Compatibility mode checks:
             * - Random value printing implies compatibility-mode, otherwise user's
             *   selection is used.
             * - Distinct sampling doesn't support compatibility-mode. The routines
             *   don't care, but users might expect larger probabilities to be a
             *   superset of smaller probabilities. This would be confusing, so
             *   flag it as an error.
             */
            enforce(!(compatibilityMode && useDistinctSampling),
                    "Distinct sampling (--k|key-fields --p|prob) does not support --compatibility-mode.");

            if (printRandom || genRandomInorder) compatibilityMode = true;

            /* Seed. An explicit non-zero --v|seed-value takes precedence over the
             * fixed seed used for --s|static-seed. An unpredictable seed is used
             * when neither is given.
             */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
/** Top-level dispatcher for the tsv-sample use cases.
 *
 * Examines the processed command line options and calls the one sampling routine
 * matching the requested mode. The modes are checked in a fixed priority order:
 * sampling with replacement, Bernoulli sampling, distinct sampling, weighted random
 * value generation (gen-random-inorder), random sampling, and finally shuffling.
 */
void tsvSample(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.srsWithReplacement)
    {
        simpleRandomSamplingWithReplacement(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useBernoulliSampling)
    {
        bernoulliSamplingCommand(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useDistinctSampling)
    {
        if (cmdopt.genRandomInorder)
        {
            distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
        return;
    }

    if (cmdopt.genRandomInorder)
    {
        /* The modes checked above either handle gen-random-inorder themselves
         * (Bernoulli, Distinct) or don't support it (SRS w/ Replacement). What
         * remains is the weighted case.
         */
        assert(cmdopt.hasWeightField);
        generateWeightedRandomValuesInorder(cmdopt, outputStream);
        return;
    }

    if (cmdopt.sampleSize != 0)
    {
        randomSamplingCommand(cmdopt, outputStream);
        return;
    }

    shuffleCommand(cmdopt, outputStream);
}
/** Bernoulli sampling command handler. Invokes the appropriate Bernoulli sampling
 * routine based on the command line arguments.
 *
 * This routine selects the appropriate Bernoulli sampling function and template
 * instantiation to use based on the command line arguments.
 *
 * One of the basic choices is whether to use the vanilla algorithm or skip sampling.
 * Skip sampling is a little bit faster when the inclusion probability is small but
 * doesn't support compatibility mode. See the bernoulliSkipSampling documentation
 * for a discussion of the skipSamplingProbabilityThreshold used here.
 *
 * Skip sampling is chosen only when all of the following hold:
 * $(LIST
 *     * Compatibility mode is off.
 *     * The inclusion probability is strictly less than 1.0. bernoulliSkipSampling
 *       requires this: it takes the log of (1.0 - inclusionProbability). An
 *       inclusion probability of exactly 1.0 is valid input, so it is routed to the
 *       vanilla algorithm, which selects every line in that case.
 *     * Either '--prefer-skip-sampling' was given or the inclusion probability is
 *       at or below the threshold.
 * )
 *
 * Params:
 *   cmdopt       = Processed command line options. Must not have a weight field.
 *   outputStream = Output range the selected lines are written to.
 */
void bernoulliSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(!cmdopt.hasWeightField);

    immutable double skipSamplingProbabilityThreshold = 0.04;

    /* A NaN inclusion probability (no --p|prob given) fails both comparisons below,
     * which keeps it on the vanilla algorithm as well.
     */
    immutable bool useSkipSampling =
        !cmdopt.compatibilityMode &&
        cmdopt.inclusionProbability < 1.0 &&
        (cmdopt.preferSkipSampling ||
         cmdopt.inclusionProbability <= skipSamplingProbabilityThreshold);

    if (useSkipSampling)
    {
        bernoulliSkipSampling(cmdopt, outputStream);
    }
    else if (cmdopt.genRandomInorder)
    {
        bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
    }
    else
    {
        bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
    }
}
/** Bernoulli sampling of lines from the input stream.
 *
 * A uniform random value is drawn for every input line. When generateRandomAll is
 * set, every line is written with its random value prepended. Otherwise a line is
 * written only if its random value is less than cmdopt.inclusionProbability, and
 * the random value is prepended only if cmdopt.printRandom is set. Input order is
 * preserved in both cases.
 *
 * If cmdopt.sampleSize is non-zero, processing stops as soon as that many lines have
 * been written. When a header is present, it is written once, preceded by the random
 * value header if random values are being printed.
 */
void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix;

    /* The template flag must agree with the command line option. */
    enum bool writeAllLines = generateRandomAll;
    assert(cmdopt.genRandomInorder == writeAllLines);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    /* Header. The first file's header was read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstSource = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(firstSource.header, firstSource.name, 1);

        if (writeAllLines || cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }

        outputStream.put(firstSource.header);
        outputStream.put("\n");
    }

    /* Body lines. Line numbering starts after the header line when there is one. */
    immutable size_t firstBodyLineNum = cmdopt.hasHeader ? 2 : 1;
    ulong linesWritten = 0;

    foreach (source; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(source.header, source.name, 1);

        foreach (ulong lineNum, line;
                 source.file.bufferedByLine!(KeepTerminator.no).enumerate(firstBodyLineNum))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, source.name, lineNum);

            /* A random value is consumed for every line, selected or not. */
            immutable double randomValue = uniform01(randomGenerator);

            static if (!writeAllLines)
            {
                /* Line not selected. */
                if (!(randomValue < cmdopt.inclusionProbability)) continue;
            }

            if (writeAllLines || cmdopt.printRandom)
            {
                outputStream.formatRandomValue(randomValue);
                outputStream.put(cmdopt.delim);
            }

            outputStream.put(line);
            outputStream.put("\n");

            /* Lines are only counted when a sample size limit is in effect. */
            if (cmdopt.sampleSize != 0 && ++linesWritten == cmdopt.sampleSize) return;
        }
    }
}
607 /** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips.
608  *
609  * Skip sampling works by skipping a random number of lines between selections. This
610  * can be faster than assigning a random value to each line when the inclusion
611  * probability is low, as it reduces the number of calls to the random number
612  * generator. Both the random number generator and the log() function are called when
613  * calculating the next skip size. These additional log() calls add up as the
614  * inclusion probability increases.
615  *
616  * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
617  * file-oriented line sampling. This is obviously environment specific. In the
618  * environments this implementation has been tested in the performance improvements
619  * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
620  *
621  * The algorithm does not assign random values to individual lines. This makes it
622  * incompatible with random value printing. It is not suitable for compatibility mode
623  * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
624  * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
625  * does not have this property.
626  *
627  * The algorithm for calculating the skip size has been described by multiple sources.
628  * There are two key variants depending on whether the total number of lines in the
629  * data set is known in advance. (This implementation does not know the total.)
630  * Useful references:
631  * $(LIST
632  *     * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
633  *       ACM Trans on Mathematical Software, 1987. On-line:
634  *       http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
635  *     * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
636  *       "Data Stream Management", Springer-Verlag, 2016. On-line:
637  *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
638  *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
639  *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
640  * )
641  */
void bernoulliSkipSampling(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.conv : to;
    import std.math : log, trunc;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix;

    /* Skip sampling does not assign a random value to each line, so the options that
     * rely on per-line values must be off.
     */
    assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.compatibilityMode);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
    immutable double logDiscardRate = log(discardRate);

    /* Draws the number of lines to skip before the next selected line.
     *
     * The '1.0 - uniform01(randomGenerator)' expression flips the half closed
     * interval to (0.0, 1.0], excluding 0.0, so log() is never called on zero.
     *
     * For extremely small inclusion probabilities 'discardRate' rounds to 1.0 and
     * 'logDiscardRate' becomes zero. The quotient is then infinite or NaN, which
     * 'to!size_t' rejects by throwing. Such a draw means "skip practically forever",
     * so any value that does not fit in a size_t is clamped to size_t.max.
     */
    size_t nextSkipCount()
    {
        immutable double skips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc;
        immutable bool fitsInSizeT = skips >= 0.0 && skips < cast(double) size_t.max;   // False for NaN.
        return fitsInSizeT ? skips.to!size_t : size_t.max;
    }

    size_t remainingSkips = nextSkipCount();

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        outputStream.put(inputStream.header);
        outputStream.put("\n");
    }

    /* Process each line. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong numLinesWritten = 0;
    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            if (remainingSkips > 0)
            {
                --remainingSkips;
                continue;
            }

            /* The skip count ran out: this line is selected. */
            outputStream.put(line);
            outputStream.put("\n");

            /* A sample size of zero means no limit on the number of lines written. */
            if (cmdopt.sampleSize != 0)
            {
                ++numLinesWritten;
                if (numLinesWritten == cmdopt.sampleSize) return;
            }

            remainingSkips = nextSkipCount();
        }
    }
}
710 /** Sample lines by choosing a random set of distinct keys formed from one or more
711  * fields on each line.
712  *
713  * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling.
714  * However, instead of each line being subject to an independent trial, lines are
715  * selected based on a key from each line. A portion of keys are randomly selected for
716  * output, and every line containing a selected key is included in the output.
717  *
718  * An example use-case is a query log having <user, query, clicked-url> triples. It is
 * often useful to sample records for a portion of the users, but including all records
720  * for the users selected. Distinct sampling supports this by selecting a subset of
721  * users to include in the output.
722  *
723  * Distinct sampling is done by hashing the key and mapping the hash value into
724  * buckets sized to hold the inclusion probability. Records having a key mapping to
725  * bucket zero are output. Buckets are equal size and therefore may be larger than the
 * inclusion probability. (The other approach would be to have the caller specify the
 * number of buckets. More correct, but less convenient.)
728  */
void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsv_utils.common.utils : bufferedByLine, InputFieldReordering,
        InputSourceRange, throwIfWindowsNewlineOnUnix;

    /* The compile-time flag and the run-time option must agree. */
    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    static if (generateRandomAll)
    {
        import std.format : formatValue, singleSpec;
        immutable bucketFormatSpec = singleSpec("%d");
    }

    /* Separator placed between fields when hashing a multi-field key. */
    immutable ubyte[1] keySeparator = [cmdopt.delim];

    /* Number of equal-sized hash buckets. Keys landing in bucket zero are selected. */
    immutable uint bucketCount = (1.0 / cmdopt.inclusionProbability).lrint.to!uint;

    /* Key field extractor. Not needed when the whole line is the key. */
    auto keyExtractor = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);

    /* The first header was read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstSource = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(firstSource.header, firstSource.name, 1);

        if (generateRandomAll || cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }

        outputStream.put(firstSource.header);
        outputStream.put("\n");
    }

    ulong linesEmitted = 0;

    /* Counts an emitted line and reports whether the requested sample size has been
     * reached. A sample size of zero means unlimited; nothing is counted then.
     */
    bool outputLimitReached()
    {
        if (cmdopt.sampleSize == 0) return false;
        ++linesEmitted;
        return linesEmitted == cmdopt.sampleSize;
    }

    /* Walk the body lines of every input source. */
    immutable size_t firstBodyLineNum = cmdopt.hasHeader ? 2 : 1;

    foreach (source; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(source.header, source.name, 1);

        foreach (ulong lineNum, line;
                 source.file.bufferedByLine!(KeepTerminator.no).enumerate(firstBodyLineNum))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, source.name, lineNum);

            /* Murmurhash is fed incrementally and then finalized. The full-line key
             * and the field-based key are fed by separate code paths.
             */
            auto hasher = MurmurHash3!32(cmdopt.seed);

            if (!cmdopt.distinctKeyIsFullLine)
            {
                assert(keyExtractor !is null);

                /* Collect the key fields, stopping as soon as all have been seen. */
                keyExtractor.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    keyExtractor.processNextField(fieldIndex, fieldValue);
                    if (keyExtractor.allFieldsFilled) break;
                }

                enforce(keyExtractor.allFieldsFilled,
                        format("Not enough fields in line. File: %s, Line: %s",
                               source.name, lineNum));

                foreach (keyNum, keyPart; keyExtractor.outputFields.enumerate)
                {
                    if (keyNum > 0) hasher.put(keySeparator);
                    hasher.put(cast(ubyte[]) keyPart);
                }
            }
            else
            {
                hasher.put(cast(ubyte[]) line);
            }

            hasher.finish;
            immutable uint bucket = hasher.get % bucketCount;

            static if (generateRandomAll)
            {
                /* Every line is written, prefixed with its bucket number. */
                outputStream.formatValue(bucket, bucketFormatSpec);
                outputStream.put(cmdopt.delim);
            }
            else
            {
                /* Only keys in bucket zero are part of the sample. */
                if (bucket != 0) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put('0');
                    outputStream.put(cmdopt.delim);
                }
            }

            outputStream.put(line);
            outputStream.put("\n");

            if (outputLimitReached()) return;
        }
    }
}
865 /** Random sampling command handler. Invokes the appropriate sampling routine based on
866  * the command line arguments.
867  *
868  * Random sampling selects a fixed size random sample from the input stream. Both
869  * simple random sampling (equal likelihood) and weighted random sampling are
870  * supported. Selected lines are output either in random order or original input order.
871  * For weighted sampling the random order is the weighted selection order.
872  *
873  * Two algorithms are used, reservoir sampling via a heap and reservoir sampling via
874  * Algorithm R. This routine selects the appropriate reservoir sampling function and
 * template instantiation to use based on the command line arguments.
876  *
877  * Weighted sampling always uses the heap approach. Compatibility mode does as well,
878  * as it is the method that uses per-line random value assignments. The implication
879  * of compatibility mode is that a larger sample size includes all the results from
880  * a smaller sample, assuming the same random seed is used.
881  *
882  * For unweighted sampling there is a performance tradeoff between implementations.
883  * Heap-based sampling is faster for small sample sizes. Algorithm R is faster for
884  * large sample sizes. The threshold used was chosen based on performance tests. See
885  * the reservoirSamplingAlgorithmR documentation for more information.
886  */
void randomSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(cmdopt.sampleSize != 0);

    /* Sample sizes at or above this value default to Algorithm R. */
    immutable size_t algorithmRSampleSizeThreshold = 128 * 1024;

    immutable bool keepInputOrder = cmdopt.preserveInputOrder;

    /* Weighted sampling: always heap-based. */
    if (cmdopt.hasWeightField)
    {
        if (keepInputOrder) reservoirSamplingViaHeap!(Yes.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream);
        else reservoirSamplingViaHeap!(Yes.isWeighted, No.preserveInputOrder)(cmdopt, outputStream);
        return;
    }

    /* Unweighted sampling: heap-based in compatibility mode, and for smaller sample
     * sizes unless Algorithm R was explicitly preferred.
     */
    immutable bool belowThreshold = cmdopt.sampleSize < algorithmRSampleSizeThreshold;
    immutable bool useHeap = cmdopt.compatibilityMode || (belowThreshold && !cmdopt.preferAlgorithmR);

    if (useHeap)
    {
        if (keepInputOrder) reservoirSamplingViaHeap!(No.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream);
        else reservoirSamplingViaHeap!(No.isWeighted, No.preserveInputOrder)(cmdopt, outputStream);
        return;
    }

    /* Everything else goes to Algorithm R. */
    if (keepInputOrder) reservoirSamplingAlgorithmR!(Yes.preserveInputOrder)(cmdopt, outputStream);
    else reservoirSamplingAlgorithmR!(No.preserveInputOrder)(cmdopt, outputStream);
}
928 /** Reservoir sampling using a heap. Both weighted and unweighted random sampling are
929  * supported.
930  *
931  * The algorithm used here is based on the one-pass algorithm described by Pavlos
932  * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
933  * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are
934  * simply set to one.
935  *
936  * The implementation uses a heap (priority queue) large enough to hold the desired
937  * number of lines. Input is read line-by-line, assigned a random value, and added to
938  * the heap. The role of the heap is to identify the lines with the highest assigned
939  * random values. Once the heap is full, adding a new line means dropping the line with
 * the lowest score. A "min" heap is used for this reason.
941  *
942  * When done reading all lines, the "min" heap is in reverse of weighted selection
 * order. Weighted selection order is obtained by removing each element one at a time
944  * from the heap. The underlying data store will have the elements in weighted selection
945  * order (largest weights first).
946  *
947  * Generating output in weighted order is useful for several reasons:
948  *  - For weighted sampling, it preserves the property that smaller valid subsets can be
949  *    created by taking the first N lines.
950  *  - For unweighted sampling, it ensures that all output permutations are possible, and
951  *    are not influenced by input order or the heap data structure used.
952  *  - Order consistency is maintained when making repeated use of the same random seed,
953  *    but with different sample sizes.
954  *
 * The other choice is preserving input order. This is supported by recording line
956  * numbers and sorting the selected sample.
957  *
958  * There are use cases where only the selection set matters. For these some performance
959  * could be gained by skipping the reordering and simply printing the backing store
960  * array in-order. Performance tests indicate only a minor benefit, so this is not
961  * supported.
962  *
963  * Notes:
964  * $(LIST
965  *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
966  *      randomization of all input lines. This was dropped in version 1.2.2 in favor
967  *      of the approach used in randomizeLines. The latter has significant advantages
968  *      given that all data must be read into memory.
969  *    * For large reservoir sizes better performance can be achieved using Algorithm R.
970  *      See the reservoirSamplingAlgorithmR documentation for details.
971  * )
972  */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : sort;
    import std.container.array;
    import std.container.binaryheap;
    import std.meta : AliasSeq;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix;

    /* The compile-time flag and the run-time option must agree. */
    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto rng = Random(cmdopt.seed);

    /* One reservoir slot: the assigned score, a private copy of the line, and, when
     * input order is preserved, the running line number used for the final sort.
     */
    static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
    {
        double score;
        const(char)[] line;
        static if (preserveInputOrder) ulong lineNumber;
    }

    alias SampleEntry = Entry!preserveInputOrder;

    /* Create the heap and its backing store.
     *
     * Note: An std.container.array is used as the backing store to avoid some issues
     * in the standard library (Phobos) binaryheap implementation. Specifically, when
     * an std.container.array is the backing store, the heap can be efficiently
     * reversed by removing the heap elements. This leaves the backing store in the
     * reversed order. However, the current binaryheap implementation does not support
     * this for all backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */
    Array!SampleEntry backingStore;
    backingStore.reserve(cmdopt.sampleSize);
    auto minHeap = backingStore.heapify!("a.score > b.score")(0);  // Lowest score at the front.

    /* The first header was read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstSource = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(firstSource.header, firstSource.name, 1);

        if (cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }

        outputStream.put(firstSource.header);
        outputStream.put("\n");
    }

    /* Read every body line, scoring it and offering it to the reservoir. */
    immutable size_t firstBodyLineNum = cmdopt.hasHeader ? 2 : 1;
    static if (preserveInputOrder) ulong totalLineNum = 0;

    foreach (source; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(source.header, source.name, 1);

        foreach (ulong lineNum, line;
                 source.file.bufferedByLine!(KeepTerminator.no).enumerate(firstBodyLineNum))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, source.name, lineNum);

            static if (isWeighted)
            {
                /* Non-positive weights get a score of zero without drawing a random value. */
                immutable double lineWeight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, source.name, lineNum);
                immutable double lineScore =
                    (lineWeight > 0.0)
                    ? uniform01(rng) ^^ (1.0 / lineWeight)
                    : 0.0;
            }
            else
            {
                immutable double lineScore = uniform01(rng);
            }

            /* Trailing constructor arguments: the line number only when it is tracked. */
            static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
            else alias entryCTArgs = AliasSeq!();

            immutable bool hasRoom = minHeap.length < cmdopt.sampleSize;

            if (hasRoom)
            {
                minHeap.insert(SampleEntry(lineScore, line.dup, entryCTArgs));
            }
            else if (minHeap.front.score < lineScore)
            {
                /* Evict the entry with the lowest score in favor of this line. */
                minHeap.replaceFront(SampleEntry(lineScore, line.dup, entryCTArgs));
            }

            static if (preserveInputOrder) ++totalLineNum;
        }
    }

    /* Done with input, all selected entries are in the reservoir. */

    /* The asserts here avoid issues with the current binaryheap implementation. They
     * detect use of backing stores having a length not synchronized to the reservoir.
     */
    immutable ulong selectedCount = minHeap.length;
    assert(selectedCount == backingStore.length);

    /* Put the backing store into the desired output order. */
    static if (preserveInputOrder)
    {
        backingStore[].sort!((a, b) => a.lineNumber < b.lineNumber);
    }
    else
    {
        /* Output in weighted selection order. The heap is in reverse order of assigned
         * weights. Removing all elements from the heap reverses the order, leaving the
         * backing store in the correct order.
         */
        while (!minHeap.empty) minHeap.removeFront;
    }

    assert(selectedCount == backingStore.length);

    foreach (entry; backingStore)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatRandomValue(entry.score);
            outputStream.put(cmdopt.delim);
        }

        outputStream.put(entry.line);
        outputStream.put("\n");
    }
}
1110 /** Generate weighted random values for all input lines, preserving input order.
1111  *
1112  * This complements weighted reservoir sampling, but instead of using a reservoir it
1113  * simply iterates over the input lines generating the values. The weighted random
1114  * values are generated with the same formula used by reservoirSampling.
1115  */
void generateWeightedRandomValuesInorder(OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.hasWeightField);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto rng = Random(cmdopt.seed);

    /* The first header was read during command line argument processing. The random
     * value column header is written ahead of it.
     */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstSource = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(firstSource.header, firstSource.name, 1);

        outputStream.put(cmdopt.randomValueHeader);
        outputStream.put(cmdopt.delim);
        outputStream.put(firstSource.header);
        outputStream.put("\n");
    }

    ulong linesEmitted = 0;

    /* Counts an emitted line and reports whether the requested sample size has been
     * reached. A sample size of zero means unlimited; nothing is counted then.
     */
    bool outputLimitReached()
    {
        if (cmdopt.sampleSize == 0) return false;
        ++linesEmitted;
        return linesEmitted == cmdopt.sampleSize;
    }

    /* Write every body line, prefixed with its weighted random value. */
    immutable size_t firstBodyLineNum = cmdopt.hasHeader ? 2 : 1;

    foreach (source; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(source.header, source.name, 1);

        foreach (ulong lineNum, line;
                 source.file.bufferedByLine!(KeepTerminator.no).enumerate(firstBodyLineNum))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, source.name, lineNum);

            immutable double lineWeight =
                getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, source.name, lineNum);

            /* Non-positive weights get a score of zero without drawing a random value. */
            immutable double lineScore =
                (lineWeight > 0.0)
                ? uniform01(rng) ^^ (1.0 / lineWeight)
                : 0.0;

            outputStream.formatRandomValue(lineScore);
            outputStream.put(cmdopt.delim);
            outputStream.put(line);
            outputStream.put("\n");

            if (outputLimitReached()) return;
        }
    }
}
1177 /** Reservoir sampling via Algorithm R
1178  *
1179  * This is an implementation of reservoir sampling using what is commonly known as
 * "Algorithm R", credited to Alan Waterman by Donald Knuth in "The Art of
1181  * Computer Programming, Volume 2: Seminumerical Algorithms". More information about
1182  * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with
1183  * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
1184  * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
1185  *
1186  * Algorithm R is used for unweighted sampling without replacement. The heap-based
1187  * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
1188  *
1189  * The classic algorithm stops after identifying the selected set of items. This
1190  * implementation goes one step further and randomizes the order of the selected
1191  * lines. This is consistent with shuffling (line order randomization), a primary
1192  * tsv-sample use-case.
1193  *
1194  * This algorithm is faster than reservoirSamplingViaHeap when the sample size
1195  * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
1196  * Insertion in this algorithm is O(1). Similarly, generating the random order in the
1197  * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
1198  *
1199  * This speed advantage may be offset a certain amount by using a more expensive random
1200  * value generator. reservoirSamplingViaHeap generates values between zero and one,
 * whereas reservoirSamplingAlgorithmR generates random integers over an ever growing
1202  * interval. The latter is expected to be more expensive. This is consistent with
1203  * performance tests indicating that reservoirSamplingViaHeap is faster when using
1204  * small-to-medium size reservoirs and large input streams.
1205  */
void reservoirSamplingAlgorithmR(Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.meta : AliasSeq;
    import std.random : Random = Mt19937, randomShuffle, uniform;
    import std.algorithm : sort;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix;

    /* Algorithm R assigns no per-line random values, so the options that rely on
     * them must be off.
     */
    assert(cmdopt.sampleSize > 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.compatibilityMode);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* One reservoir slot: a private copy of the line and, when input order is
     * preserved, the running line number used for the final sort.
     */
    static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
    {
        const(char)[] line;
        static if (preserveInputOrder) ulong lineNumber;
    }

    Entry!preserveInputOrder[] reservoir;
    auto reservoirAppender = appender(&reservoir);
    reservoirAppender.reserve(cmdopt.sampleSize);

    auto randomGenerator = Random(cmdopt.seed);

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        outputStream.put(inputStream.header);
        outputStream.put("\n");
    }

    /* Process each line. 'totalLineNum' is the zero-based index of the current line
     * across all input sources; it is incremented at the end of each iteration.
     */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong totalLineNum = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);

            /* Add lines to the reservoir until the reservoir is filled.
             * After that lines are added with decreasing likelihood, based on
             * the total number of lines seen. If added to the reservoir, the
             * line replaces a randomly chosen existing line.
             */
            static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
            else alias entryCTArgs = AliasSeq!();

            if (totalLineNum < cmdopt.sampleSize)
            {
                reservoirAppender ~= Entry!preserveInputOrder(line.idup, entryCTArgs);
            }
            else
            {
                /* Algorithm R: the line with zero-based index t is kept with probability
                 * k / (t + 1), where k is the reservoir size. That requires a draw from
                 * the t + 1 values 0 .. t inclusive. 'uniform' excludes its upper bound,
                 * so the bound passed is 'totalLineNum + 1'. Passing 'totalLineNum' would
                 * make the first line after the fill always replace a reservoir entry.
                 */
                immutable size_t i = uniform(0, totalLineNum + 1, randomGenerator);
                if (i < reservoir.length)
                {
                    reservoir[i] = Entry!preserveInputOrder(line.idup, entryCTArgs);
                }
            }

            ++totalLineNum;
        }
    }

    /* Done with input. The sample is in the reservoir. Update the order and print. */

    static if (preserveInputOrder)
    {
        reservoir.sort!((a, b) => a.lineNumber < b.lineNumber);
    }
    else
    {
        /* Replacement positions depend on input order, so shuffle to randomize. */
        reservoir.randomShuffle(randomGenerator);
    }

    foreach (ref entry; reservoir)
    {
        outputStream.put(entry.line);
        outputStream.put("\n");
    }
}
1302 /** Shuffling command handler. Invokes the appropriate shuffle (line order
1303  * randomization) routine based on the command line arguments.
1304  *
1305  * Shuffling has similarities to random sampling, but the algorithms used are
1306  * different. Random sampling selects a subset, only the current subset selection
1307  * needs to be kept in memory. This is supported by reservoir sampling. By contrast,
1308  * shuffling needs to hold all input in memory, so it works better to read all lines
1309  * into memory at once and then shuffle.
1310  *
1311  * Two different algorithms are used. Array shuffling is used for unweighted shuffling.
1312  * Sorting plus random weight assignments is used for weighted shuffling and when
1313  * compatibility mode is being used.
1314  *
1315  * The algorithms used here are all limited by available memory.
1316  */
void shuffleCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    /* Weighted shuffling: sort on weighted random values. */
    if (cmdopt.hasWeightField)
    {
        randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    /* Compatibility mode: sort on unweighted random values. */
    if (cmdopt.compatibilityMode)
    {
        randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
        return;
    }

    /* Plain unweighted shuffling: array shuffle. */
    randomizeLinesViaShuffle(cmdopt, outputStream);
}
1334 /** Shuffle all input lines by assigning random weights and sorting.
1335  *
1336  * randomizeLinesViaSort reads in all input lines and writes them out in random order.
1337  * The algorithm works by assigning a random value to each line and sorting. Both
1338  * weighted and unweighted shuffling are supported.
1339  *
1340  * Notes:
1341  * $(LIST
1342  *   * For unweighted shuffling randomizeLinesViaShuffle is faster and should be used
1343  *     unless compatibility mode is needed.
1344  *   * This routine is significantly faster than heap-based reservoir sampling in the
1345  *     case where the entire file is being read.
1346  *   * Input data must be read entirely in memory. Disk oriented techniques are needed
1347  *     when data sizes get too large for available memory. One option is to generate
1348  *     random values for each line, e.g. --gen-random-inorder, and sort with a disk-
1349  *     backed sort program like GNU sort.
1350  * )
1351  */
void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : sort;

    /* The compile-time flag and the run-time option must agree. */
    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    /* Shuffling outputs every line; no sample size applies. */
    assert(cmdopt.sampleSize == 0);

    /* Read all file data into memory (readFileData also writes the first header
     * line), then split it into lines, each with an assigned random value.
     */
    const allData = readFileData!(Yes.hasRandomValue)(cmdopt, outputStream);
    auto lineEntries = identifyInputLines!(Yes.hasRandomValue, isWeighted)(allData, cmdopt);

    /* Order the lines by descending random value. */
    sort!((a, b) => a.randomValue > b.randomValue)(lineEntries);

    /* Write the lines, optionally prefixed with their random values. */
    immutable bool showRandomValue = cmdopt.printRandom;

    foreach (entry; lineEntries)
    {
        if (showRandomValue)
        {
            outputStream.formatRandomValue(entry.randomValue);
            outputStream.put(cmdopt.delim);
        }

        outputStream.put(entry.data);
        outputStream.put("\n");
    }
}
/** Shuffle (randomize) all input lines using an array shuffling algorithm.
 *
 * Every line from the input files and/or standard input is read into memory, the
 * resulting line array is shuffled in place, and the lines are written out in the
 * shuffled order. Array shuffling is faster than sorting, which makes this routine
 * the preferred alternative to randomizeLinesViaSort for unweighted shuffling (the
 * most common case).
 *
 * Input data size is limited by available memory. Larger data sets need a disk
 * oriented technique, for example generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 *
 * This routine does not support random value printing or compatibility-mode.
 */
void randomizeLinesViaShuffle(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, randomShuffle;

    /* Options this routine cannot honor must all be off. */
    assert(cmdopt.sampleSize == 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    /* Load all file data and split it into lines. */
    const blocks = readFileData!(No.hasRandomValue)(cmdopt, outputStream);
    auto lines = blocks.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt);

    /* Shuffle in place with a generator seeded from the options.
     *
     * Note: randomCover was also tried, but that was exceedingly slow.
     */
    auto rng = Random(cmdopt.seed);
    lines.randomShuffle(rng);

    foreach (entry; lines)
    {
        outputStream.put(entry.data);
        outputStream.put("\n");
    }
}
/** Simple random sampling with replacement.
 *
 * All lines from the input files and/or standard input are read into memory. Lines
 * are then picked at random, one at a time, and written out. The same line can be
 * picked more than once. Sampling stops once the requested number of samples
 * (--n|num) has been written. If no sample size was provided (sampleSize is zero)
 * output continues indefinitely. Nothing is sampled when there are no input lines.
 */
void simpleRandomSamplingWithReplacement(OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform;

    /* Load all file data and split it into lines. */
    const blocks = readFileData!(No.hasRandomValue)(cmdopt, outputStream);
    const lines = blocks.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt);

    /* Nothing to sample from. */
    if (lines.length == 0) return;

    auto rng = Random(cmdopt.seed);

    /* Repeat forever if sampleSize is zero, otherwise print sampleSize lines. */
    immutable bool unlimited = (cmdopt.sampleSize == 0);
    size_t remaining = cmdopt.sampleSize;

    while (unlimited || remaining != 0)
    {
        immutable size_t pick = uniform(0, lines.length, rng);
        outputStream.put(lines[pick].data);
        outputStream.put("\n");
        if (!unlimited) remaining--;
    }
}
/** A container holding data read from a file or standard input.
 *
 * An InputBlock represents one contiguous block of data read from a single file or
 * from standard input. readFileData returns an array of InputBlocks. A file whose
 * size is known is read as one block. Standard input, and files whose size cannot be
 * determined, may be split across several blocks. A line never spans two blocks. The
 * blocks belonging to one file are numbered consecutively starting with zero.
 *
 * Code in this file constructs InputBlocks positionally, as
 * InputBlock(filename, blockNumber), so the order of the first two members matters.
 *
 * See readFileData() for more information.
 */
static struct InputBlock
{
    string filename;          /// Original filename or path. "-" denotes standard input.
    size_t fileBlockNumber;   /// Zero-based block number for the file.
    char[] data;              /// The actual data. Newline terminated or last block for the file.
}
/** Read all data from the input sources into memory. This routine is used by the
 * algorithms that need the complete input before they can produce output.
 *
 * The data is returned as an array of InputBlock structs. A file whose size can be
 * determined is read as a single InputBlock sized to match the file. Standard input,
 * and files whose size cannot be determined, are read as one or more blocks. Using
 * multiple blocks in those cases avoids expensive memory reallocations; that is not
 * a concern when the size is known, as the memory can be preallocated.
 *
 * Lines never span blocks and newlines are preserved, so every block starts at the
 * beginning of a line and ends with a newline unless the end of the file was reached.
 *
 * If header processing is on and the first input source has a non-empty header, that
 * header is written to outputStream, preceded by the random value header and the
 * delimiter when random value printing is on. The header itself was already read
 * during command line argument processing.
 *
 * Each file gets its own block. Before InputSourceRange was used this was needed for
 * header processing. With InputSourceRange the header is read separately, so this
 * could be changed.
 */
InputBlock[] readFileData(HasRandomValue hasRandomValue, OutputRange)
(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : InputSourceRange, throwIfWindowsNewlineOnUnix;

    enum BlockSize = 1024L * 1024L * 1024L;  // 1 GB. ('L' notation avoids overflow w/ 2GB+ sizes.)
    enum ReadSize = 1024L * 128L;
    enum NewlineSearchSize = 1024L * 16L;

    static if (!hasRandomValue) assert(!cmdopt.printRandom);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* Write the header of the first input source, if there is one. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstSource = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(firstSource.header, firstSource.name, 1);

        if (cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(firstSource.header);
        outputStream.put("\n");
    }

    InputBlock[] result;
    auto resultAppender = appender(&result);
    resultAppender.reserve(cmdopt.inputSources.length);  // At least one block per file.

    /* One read buffer is shared by all the reads below. */
    ubyte[] readBuffer = new ubyte[ReadSize];

    foreach (source; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(source.header, source.name, 1);

        /* File.size() returns ulong.max when the size cannot be determined. Standard
         * input is mapped to the same value so both cases share the multi-block path.
         */
        immutable ulong sizeInBytes = source.isStdin ? ulong.max : source.file.size;
        auto handle = source.file;

        if (sizeInBytes == ulong.max)
        {
            readFileDataAsMultipleBlocks(
                source.name, handle, resultAppender, readBuffer,
                BlockSize, NewlineSearchSize);
        }
        else
        {
            readFileDataAsOneBlock(source.name, handle, sizeInBytes,
                                   resultAppender, readBuffer);
        }
    }

    return result;
}
/* readFileData() helper function. Reads everything from a File handle into a single
 * block, which is appended to an existing InputBlock[] array.
 *
 * readFileDataAsOneBlock is part of the readFileData logic. It handles the case
 * where a file is read as a single block. initialBlockSize is the amount of memory
 * reserved for the block up front; normally the size of the file is passed.
 *
 * This routine has been separated out to enable unit testing. At present it is not
 * intended as a general API. See readFileData for more info.
 */
private void readFileDataAsOneBlock(
    string filename,
    ref File ifile,
    const ulong initialBlockSize,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer)
{
    /* Add the file's only block (block number zero) and write into its data member. */
    blocksAppender.put(InputBlock(filename, 0));
    auto contents = appender(&(blocksAppender.data[$-1].data));
    contents.reserve(initialBlockSize);

    /* Copy the file into the block, one read buffer at a time. */
    for (auto chunks = ifile.byChunk(rawReadBuffer); !chunks.empty; chunks.popFront())
    {
        contents.put(cast(char[]) chunks.front);
    }
}
/* readFileData() helper function. Read data from a File handle as one or more blocks.
 * Blocks are appended to an existing InputBlock[] array.
 *
 * readFileDataAsMultipleBlocks is part of the readFileData logic. It handles the case
 * where a file or standard input is being read as a series of blocks. This is the
 * standard approach for standard input, but also applies when the file size cannot be
 * determined.
 *
 * Data is read in chunks of rawReadBuffer's size and appended to the current block
 * while it fits in the block's reserved capacity. When a chunk does not fit, the
 * routine tries to end the current block on a newline and start a new block, so that
 * no line spans two blocks. If no suitable newline is found the current block is
 * allowed to grow beyond its reserved capacity.
 *
 * Params:
 *   filename          = Name stored in every InputBlock created by this call.
 *   ifile             = Open file handle to read from.
 *   blocksAppender    = Appender for the InputBlock[] array receiving the blocks.
 *   rawReadBuffer     = Buffer used for reading; its length is the read size.
 *   blockSize         = Capacity reserved for each new block.
 *   newlineSearchSize = Maximum number of bytes at the end of the current block that
 *                       are searched for a newline.
 *
 * This routine has been separated out to enable unit testing. At present it is not
 * intended as a general API. See readFileData for more info.
 */
private void readFileDataAsMultipleBlocks(
    string filename,
    ref File ifile,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer,
    const size_t blockSize,
    const size_t newlineSearchSize)
{
    import std.algorithm : find, min;
    import std.range : retro;

    assert(ifile.isOpen);

    /* Create a new block for the file and an Appender for writing data. The Appender
     * writes directly into the 'data' member of the last block in the array.
     */
    blocksAppender.put(InputBlock(filename, 0));
    auto dataAppender = appender(&(blocksAppender.data[$-1].data));
    dataAppender.reserve(blockSize);
    size_t blockNumber = 0;

    /* Read all the data and copy it to an InputBlock. */
    foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
    {
        /* The last block in the array is always the one currently being filled. */
        assert(blockNumber == blocksAppender.data[$-1].fileBlockNumber);

        immutable size_t remainingCapacity = dataAppender.capacity - dataAppender.data.length;

        if (buffer.length <= remainingCapacity)
        {
            /* The whole read buffer fits in the current block. */
            dataAppender.put(cast(char[]) buffer);
        }
        else
        {
            /* Look for the last newline in the input buffer that fits in remaining
             * capacity of the block. Searching the reversed region and taking the
             * source of the result gives the part of the region up to and including
             * that newline. It is empty if the region has no newline.
             */
            auto searchRegion = buffer[0 .. remainingCapacity];
            auto appendRegion = searchRegion.retro.find('\n').source;

            if (appendRegion.length > 0)
            {
                /* Copy the first part of the read buffer to the block. */
                dataAppender.put(cast(char[]) appendRegion);

                /* Create a new InputBlock and copy the remaining data to it. */
                blockNumber++;
                blocksAppender.put(InputBlock(filename, blockNumber));
                dataAppender = appender(&(blocksAppender.data[$-1].data));
                dataAppender.reserve(blockSize);
                dataAppender.put(cast(char[]) buffer[appendRegion.length .. $]);

                /* The block just completed must end on a line boundary. */
                assert(blocksAppender.data.length >= 2);
                assert(blocksAppender.data[$-2].data[$-1] == '\n');
            }
            else
            {
                /* Search backward in the current block for a newline. If found, it
                 * becomes the last newline in the current block. Anything following
                 * it is moved to a new block. If a newline is not found, simply
                 * append to the current block and let it grow. The backward search
                 * covers at most the last newlineSearchSize bytes of the block.
                 */
                immutable size_t currBlockLength = blocksAppender.data[$-1].data.length;
                immutable size_t searchLength = min(currBlockLength, newlineSearchSize);
                immutable size_t searchStart = currBlockLength - searchLength;
                auto blockSearchRegion = blocksAppender.data[$-1].data[searchStart .. $];

                /* Length of the search region up to and including its last newline.
                 * Zero if the search region has no newline.
                 */
                auto lastNewlineOffset = blockSearchRegion.retro.find('\n').source.length;

                if (lastNewlineOffset != 0)
                {
                    /* Create a new InputBlock. The previous InputBlock is then found
                     * at blocksAppender.data[$-2]. It may be a physically different
                     * struct (a copy) if the blocks array gets reallocated.
                     */
                    blockNumber++;
                    blocksAppender.put(InputBlock(filename, blockNumber));
                    dataAppender = appender(&(blocksAppender.data[$-1].data));
                    dataAppender.reserve(blockSize);

                    /* Copy data following the newline from the last block to the new
                     * block. Then append the current read buffer.
                     */
                    immutable size_t moveRegionStart = searchStart + lastNewlineOffset;
                    dataAppender.put(blocksAppender.data[$-2].data[moveRegionStart .. $]);
                    dataAppender.put(cast(char[]) buffer);

                    /* Now delete the moved region from the last block. */
                    blocksAppender.data[$-2].data.length = moveRegionStart;

                    /* The truncated block must end on a line boundary. */
                    assert(blocksAppender.data.length >= 2);
                    assert(blocksAppender.data[$-2].data[$-1] == '\n');
                }
                else
                {
                    /* Give up. Allow the current block to grow. */
                    dataAppender.put(cast(char[]) buffer);
                }
            }
        }
    }
}
/** HasRandomValue is a boolean flag used at compile time to distinguish use cases
 * needing random value assignments from those that don't. It is a template parameter
 * of readFileData, identifyInputLines and the InputLine struct.
 */
alias HasRandomValue = Flag!"hasRandomValue";
/** An InputLine array is returned by identifyInputLines, with one entry for each line
 * found in the InputBlock array it is given. The 'data' member contains the line. A
 * 'randomValue' member is included if random values are being generated, that is,
 * when the struct is instantiated with Yes.hasRandomValue.
 */
static struct InputLine(HasRandomValue hasRandomValue)
{
    const(char)[] data;                                /// The text of the line.
    static if (hasRandomValue) double randomValue;     /// Random value assigned to the line.
}
/** identifyInputLines does the initial processing of file data for the algorithms
 * that read all input into memory before doing their work.
 *
 * It performs two tasks. The first is splitting the input data into lines. The second
 * is assigning a random value to each line, when random values are being generated.
 * For weighted use the line's weight is read from the weight field and the random
 * value is a uniform random number raised to the power (1 / weight). Lines with a
 * weight that is not greater than zero get a random value of zero.
 *
 * The key input is an InputBlock array. Normally there is one block per file, but
 * standard input may have multiple blocks. Line numbering restarts whenever a block
 * with fileBlockNumber zero is encountered.
 *
 * The return value is an array of InputLine structs. The struct has a 'randomValue'
 * member if random values are being assigned.
 */
InputLine!hasRandomValue[] identifyInputLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted)
(const ref InputBlock[] inputBlocks, ref TsvSampleOptions cmdopt)
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    /* Weighted processing is only meaningful when random values are generated. */
    static assert(hasRandomValue || !isWeighted);
    static if (!hasRandomValue) assert(!cmdopt.printRandom);

    alias Line = InputLine!hasRandomValue;

    Line[] result;
    auto resultAppender = appender(&result);

    static if (hasRandomValue) auto rng = Random(cmdopt.seed);

    /* Note: lineNum is zero-based here. One-based in most other code in this file. */
    immutable size_t firstBodyLine = cmdopt.hasHeader ? 1 : 0;
    size_t lineNum = firstBodyLine;

    foreach (block; inputBlocks)
    {
        /* The first block of a file restarts the line numbering. */
        if (block.fileBlockNumber == 0) lineNum = firstBodyLine;

        /* Drop the last newline to avoid adding an extra empty line.
         * NOTE(review): a block that is exactly "\n" becomes empty text here, which
         * appears to produce no lines at all - confirm a lone blank line may be dropped.
         */
        const(char)[] text = block.data;
        if (text.length > 0 && text[$-1] == '\n') text = text[0 .. $-1];

        foreach (line; text.splitter('\n'))
        {
            ++lineNum;

            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, block.filename, lineNum);

            static if (!hasRandomValue)
            {
                resultAppender.put(Line(line));
            }
            else static if (!isWeighted)
            {
                immutable double randomValue = uniform01(rng);
                resultAppender.put(Line(line, randomValue));
            }
            else
            {
                immutable double weight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                         block.filename, lineNum);

                /* A random number is only drawn for lines with a positive weight. */
                immutable double randomValue =
                    (weight > 0.0) ? uniform01(rng) ^^ (1.0 / weight) : 0.0;

                resultAppender.put(Line(line, randomValue));
            }
        }
    }

    return result;
}
/* Unit tests for the readFileData helper functions. These tests focus on multiple
 * InputBlock scenarios. Other use paths are well tested by the main program tests
 * at the end of the file.
 *
 * Three files are written to a temporary directory. They are then read back with
 * readFileDataAsOneBlock and with readFileDataAsMultipleBlocks, the latter using a
 * range of search, block, and read sizes. In each case the lines produced by
 * identifyInputLines must match the lines of the original data.
 */
unittest
{
    import tsv_utils.common.unittest_utils;
    import std.algorithm : equal, find, joiner, splitter;
    import std.array : appender;
    import std.file : rmdirRecurse;
    import std.path : buildPath;
    import std.range : repeat;

    auto rfdTestDir = makeUnittestTempDir("tsv_sample_readFileData");
    scope(exit) rfdTestDir.rmdirRecurse;

    char[] file1Data;
    char[] file2Data;
    char[] file3Data;

    auto app1 = appender(&file1Data);
    auto app2 = appender(&file2Data);
    auto app3 = appender(&file3Data);

    /* File 1: 1000 short lines. */
    app1.put("\n".repeat(100).joiner);
    app1.put("x\n".repeat(100).joiner);
    app1.put("yz\n".repeat(100).joiner);
    app1.put("pqr\n".repeat(100).joiner);
    app1.put("a\nbc\ndef\n".repeat(100).joiner);
    app1.put('\n'.repeat(100));
    app1.put("z\n".repeat(100).joiner);
    app1.put("xy\n".repeat(100).joiner);

    /* File 2: 600 longer lines (100 + 4 x 100 + 100). */
    app2.put(
        "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
        .repeat(100)
        .joiner);
    app2.put(
        "|abcdefghijklmnopqrstuv|\n|0123456789|\n|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ|\n|abcdefghijklmnopqrstuvwxyz|\n"
        .repeat(100)
        .joiner);
    app2.put(
         "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
        .repeat(100)
        .joiner);

    /* File 3: 1000 mixed length lines. */
    app3.put("\n\n|abcde|\n1\n12\n123\n|abcdefghijklmnop|\n|xyz|\n0123456789\nX\n".repeat(100).joiner);

    string file1Path = buildPath(rfdTestDir, "file1.txt");
    string file2Path = buildPath(rfdTestDir, "file2.txt");
    string file3Path = buildPath(rfdTestDir, "file3.txt");

    /* Write the three files. Each File is scoped to its try block. */
    try
    {
        auto ofile1 = File(file1Path, "w");
        ofile1.write(file1Data);
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file1Path, e.msg));

    try
    {
        auto ofile2 = File(file2Path, "w");
        ofile2.write(file2Data);
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file2Path, e.msg));

    try
    {
        auto ofile3 = File(file3Path, "w");
        ofile3.write(file3Data);
    }
    catch  (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file3Path, e.msg));

    /* Expected lines when no header processing is done. All the data ends with a
     * newline, so the split produces a final empty element. It is dropped.
     */
    auto allData = file1Data ~ file2Data ~ file3Data;
    auto expectedLines = allData.splitter('\n').array[0 .. $-1];

    /* Expected lines with header processing. The first line of files 2 and 3 is
     * removed. File 1 keeps its first line, which is the header that gets written.
     */
    auto file2DataNoHeader = (file2Data.find('\n'))[1 .. $];
    auto file3DataNoHeader = (file3Data.find('\n'))[1 .. $];
    auto allDataUsingHeader = file1Data ~ file2DataNoHeader ~ file3DataNoHeader;
    auto expectedLinesUsingHeader = allDataUsingHeader.splitter('\n').array[0 .. $-1];

    /* Two header lines were removed. */
    assert(expectedLines.length == expectedLinesUsingHeader.length + 2);

    TsvSampleOptions cmdoptNoHeader;
    auto noHeaderCmdArgs = ["unittest", file1Path];
    auto r1 = cmdoptNoHeader.processArgs(noHeaderCmdArgs);
    assert(r1[0], format("Invalid command lines arg: '%s'.", noHeaderCmdArgs));

    TsvSampleOptions cmdoptYesHeader;
    auto yesHeaderCmdArgs = ["unittest", "--header", file1Path];
    auto r2 = cmdoptYesHeader.processArgs(yesHeaderCmdArgs);
    assert(r2[0], format("Invalid command lines arg: '%s'.", yesHeaderCmdArgs));

    auto outputStream = appender!(char[])();

    {
        /* Reading as single blocks. */
        ubyte[] rawReadBuffer = new ubyte[256];
        InputBlock[] blocks;
        auto blocksAppender = appender(&blocks);
        blocksAppender.reserve(3);
        foreach (f; [ file1Path, file2Path, file3Path ])
        {
            auto ifile = f.File;
            ulong filesize = ifile.size;
            /* Use an arbitrary initial size if the file size cannot be determined. */
            if (filesize == ulong.max) filesize = 1000;
            readFileDataAsOneBlock(f, ifile, filesize, blocksAppender, rawReadBuffer);
        }
        auto inputLines =
            identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                blocks, cmdoptNoHeader);

        assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
    }

    {
        /* Reading as multiple blocks. Every combination of the search, block, and
         * read sizes below is tried.
         */
        foreach (size_t searchSize; [ 0, 1, 2, 64 ])
        {
            foreach (size_t blockSize; [ 1, 2, 16, 64, 256 ])
            {
                foreach (size_t readSize; [ 1, 2, 8, 32 ])
                {
                    ubyte[] rawReadBuffer = new ubyte[readSize];
                    InputBlock[] blocks;
                    auto blocksAppender = appender(&blocks);
                    blocksAppender.reserve(3);
                    foreach (f; [ file1Path, file2Path, file3Path ])
                    {
                        auto ifile = f.File;
                        readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
                                                     rawReadBuffer, blockSize, searchSize);
                    }
                    auto inputLines =
                        identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                            blocks, cmdoptNoHeader);

                    assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
                }
            }
        }
    }
    /* The section below is disabled by version(none).
     * NOTE(review): nothing in this test writes to outputStream, and the helper being
     * called does no header handling, so this section presumably needs rework before
     * it can be re-enabled - confirm.
     */
    version(none) {
    {
        /* Reading as multiple blocks, with header processing. */
        const size_t readSize = 32;
        const size_t blockSize = 48;
        const size_t searchSize = 16;

        ubyte[] rawReadBuffer = new ubyte[readSize];
        InputBlock[] blocks;
        auto blocksAppender = appender(&blocks);
        blocksAppender.reserve(3);
        foreach (f; [ file1Path, file2Path, file3Path ])
        {
            auto ifile = f.File;
            readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
                                         rawReadBuffer, blockSize, searchSize);
        }
        auto inputLines =
            identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                blocks, cmdoptYesHeader);

        assert(outputStream.data == expectedLinesUsingHeader[0] ~ '\n');
        assert(equal!((a, b) => a.data == b)(inputLines, expectedLinesUsingHeader[1 .. $]));
    }
    }
}
/** Write a floating point random value to an output stream.
 *
 * This routine is used for printing floating point random values. It writes 17
 * significant digits, the range available in doubles, and prefers decimal format
 * without exponents. To stay in decimal format it produces somewhat large precision
 * numbers, currently up to 28 digits after the decimal point, before switching to
 * exponents. Values below 1e-12 are written with the "%.17g" format.
 *
 * The primary reason for this approach is to enable faster sorting on random values
 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster
 * on decimal format numeric sorts ('n' switch) than on general numeric sorts ('g'
 * switch). The 'general numeric' sort handles exponential notation. The difference
 * is 5-10x.
 *
 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12.
 * No examples less than 1e-09 were seen in hundreds of millions of trials. Similar
 * results were seen with weighted sampling with integer weights. The same is not true
 * with floating point weights. These produce quite large exponents. However, even
 * for floating point weights this can be useful. For random weights [0,1] less than 5%
 * will be less than 1e-12 and use exponential notation.
 */
void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;

    /* Lower bounds, largest first. Each bound is paired, index for index, with the
     * fixed point format used for values at or above it. Every step down in magnitude
     * adds one digit after the decimal point, keeping 17 significant digits.
     */
    static immutable double[12] lowerBounds = [
        1e-01, 1e-02, 1e-03, 1e-04, 1e-05, 1e-06,
        1e-07, 1e-08, 1e-09, 1e-10, 1e-11, 1e-12,
    ];
    static immutable string[12] fixedFormats = [
        "%.17f", "%.18f", "%.19f", "%.20f", "%.21f", "%.22f",
        "%.23f", "%.24f", "%.25f", "%.26f", "%.27f", "%.28f",
    ];

    /* Values below every bound fall through to the general format. */
    string chosenFormat = "%.17g";
    foreach (i, bound; lowerBounds)
    {
        if (value >= bound)
        {
            chosenFormat = fixedFormats[i];
            break;
        }
    }

    immutable formatSpec = singleSpec(chosenFormat);
    outputStream.formatValue(value, formatSpec);
}
@safe unittest
{
    import std.array : appender;

    /* Each case pairs an input value with the exact text formatRandomValue must write. */
    static struct Case
    {
        double value;
        string expected;
    }

    static immutable Case[] cases = [
        /* Powers of ten: one more digit is printed for each step down to 1e-12.
         * Smaller values switch to exponential notation.
         */
        Case(1.0,   "1.00000000000000000"),
        Case(0.1,   "0.10000000000000001"),
        Case(0.01,  "0.010000000000000000"),
        Case(1e-03, "0.0010000000000000000"),
        Case(1e-04, "0.00010000000000000000"),
        Case(1e-05, "0.000010000000000000001"),
        Case(1e-06, "0.0000010000000000000000"),
        Case(1e-07, "0.00000010000000000000000"),
        Case(1e-08, "0.000000010000000000000000"),
        Case(1e-09, "0.0000000010000000000000001"),
        Case(1e-10, "0.00000000010000000000000000"),
        Case(1e-11, "0.000000000009999999999999999"),
        Case(1e-12, "0.0000000000010000000000000000"),
        Case(1e-13, "1e-13"),
        Case(1e-14, "1e-14"),

        /* A 17 digit value scaled through the same magnitudes. */
        Case(12345678901234567e-15, "12.34567890123456735"),
        Case(12345678901234567e-16, "1.23456789012345669"),
        Case(12345678901234567e-17, "0.12345678901234566"),
        Case(12345678901234567e-18, "0.012345678901234567"),
        Case(12345678901234567e-19, "0.0012345678901234567"),
        Case(12345678901234567e-20, "0.00012345678901234567"),
        Case(12345678901234567e-21, "0.000012345678901234568"),
        Case(12345678901234567e-22, "0.0000012345678901234567"),
        Case(12345678901234567e-23, "0.00000012345678901234566"),
        Case(12345678901234567e-24, "0.000000012345678901234567"),
        Case(12345678901234567e-25, "0.0000000012345678901234566"),
        Case(12345678901234567e-26, "0.00000000012345678901234568"),
        Case(12345678901234567e-27, "0.000000000012345678901234567"),
        Case(12345678901234567e-28, "0.0000000000012345678901234567"),
        Case(12345678901234567e-29, "1.2345678901234566e-13"),
    ];

    foreach (c; cases)
    {
        auto buf = appender!string();
        buf.formatRandomValue(c.value);
        assert(buf.data == c.expected,
               format("[testFormatValue] value: %g; expected: %s; actual: %s", c.value, c.expected, buf.data));
    }
}
import std.traits : isSomeChar;

/** Convenience function for extracting a single field from a line and converting it
 * to type T. See [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper
 * catches the exceptions raised there and rethrows them with error text tailored for
 * this program, naming the file and line number. A file name of "-" is reported as
 * "Standard Input". Conversion failures on line 1 add a hint about header lines.
 */
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, ulong lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException;
    import tsv_utils.common.utils : getTsvFieldValue;

    /* Name used for the input source in error messages. */
    immutable string sourceName = (filename == "-") ? "Standard Input" : filename;

    try
    {
        return getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* The field text could not be converted. A failure on the first line may
         * mean a header line is being read as data.
         */
        immutable string headerHint =
            (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : "";

        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s%s",
                   exc.msg, sourceName, lineNum, headerHint));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s",
                   exc.msg, sourceName, lineNum));
    }
}
@safe unittest
{
    /* Basic sanity checks on the getFieldValue wrapper. Field extraction and
     * conversion are covered by getTsvFieldValue's own tests.
     */
    import std.exception;

    enum delim = '\t';
    enum source = "unittest";

    /* Well-formed numeric fields are returned as doubles. */
    assert(getFieldValue!double("123", 0, delim, source, 1) == 123);
    assert(getFieldValue!double("123.4", 0, delim, source, 1) == 123.4);

    /* Failures throw both on line 1 and on later lines. */
    foreach (lineNum; [1UL, 2UL])
    {
        assertThrown(getFieldValue!double("abc", 0, delim, source, lineNum));   // Not a number.
        assertThrown(getFieldValue!double("123", 1, delim, source, lineNum));   // No field at index 1.
    }
}
2123 /* Unit tests for the main program start here.
2124  *
2125  * Portability note: Many of the tests here rely on generating consistent random numbers
2126  * across different platforms when using the same random seed. So far this has succeeded
2127  * on several different platform, compiler, and library versions. However, it is certainly
2128  * possible this condition will not hold on other platforms.
2129  *
2130  * For tsv-sample, this portability implies generating the same results on different
2131  * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
2132  * but it is convenient for testing. If platforms are identified that do not generate
2133  * the same results these tests will need to be adjusted.
2134  */
version(unittest)
{
    /* Helpers shared by the unit tests that follow. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /** Runs tsvSample with the given command line and compares the output it
     * produces against the expected rows.
     *
     * The arguments are processed into a TsvSampleOptions, tsvSample writes into an
     * in-memory appender, and the captured text must equal `expected` as rendered
     * by tsvDataToString. Any mismatch fails an assert.
     *
     * Params:
     *   cmdArgs  = Command line arguments. Must not be empty; cmdArgs[0] is used to
     *              label assertion messages.
     *   expected = Expected output rows.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Builds an assertion message labeled with this helper's name and cmdArgs[0]. */
        auto failureText(Args...)(string detail, Args detailArgs)
        {
            return format("[testTsvSample] %s: " ~ detail, cmdArgs[0], detailArgs);
        }

        /* Printable form of the arguments, taken before processArgs runs in case it
         * alters cmdArgs.
         */
        auto argsAsText = cmdArgs.to!string;

        TsvSampleOptions options;
        auto parseResult = options.processArgs(cmdArgs);
        assert(parseResult[0], failureText("Invalid command lines arg: '%s'.", argsAsText));

        auto captured = appender!(char[])();
        tsvSample(options, captured);    // This invokes the main code line.

        auto wanted = expected.tsvDataToString;
        assert(captured.data == wanted,
               failureText(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   wanted.to!string, captured.data.to!string));
    }
}
2171 unittest
2172 {
2173     import std.path : buildPath;
2174     import std.file : rmdirRecurse;
2176     auto testDir = makeUnittestTempDir("tsv_sample");
2177     scope(exit) testDir.rmdirRecurse;
2179     /* Tabular data sets and expected results use the built-in static seed.
2180      * Tests are run by writing the data set to a file, then calling the main
2181      * routine to process. The function testTsvSample plays the role of the
     * main program. Rather than writing to standard output, the results are
2183      * matched against expected. The expected results were verified by hand
2184      * prior to inclusion in the test.
2185      *
2186      * The initial part of this section is simply setting up data files and
2187      * expected results.
2188      *
2189      * Expected results naming conventions:
2190      *  - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected
2191      *  - Sampling Type (required): Permute (Shuffle), Sample, Replace, Bernoulli, Distinct
2192      *  - Compatibility: Compat, AlgoR, Skip, Swap, Inorder
2193      *  - Weight Field: Wt<num>, e.g. Wt3
2194      *  - Sample Size: Num<num>, eg. Num3
2195      *  - Seed Value: V<num>, eg. V77
2196      *  - Key Field: K<num>, e.g. K2
2197      *  - Probability: P<num>, e.g P05 (5%)
2198      *  - Printing Probabilities: Probs
2199      *  - Printing Probs in order: ProbsInorder
2200      *  - Printing Probs with custom header: RVCustom
2201      */
2203     /* Empty file. */
2204     string[][] dataEmpty = [];
2205     string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
2206     writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
    /* 3x0, header only. */
2209     string[][] data3x0 = [["field_a", "field_b", "field_c"]];
2210     string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
2211     writeUnittestTsvFile(fpath_data3x0, data3x0);
2213     /* 3x1 */
2214     string[][] data3x1 =
2215         [["field_a", "field_b", "field_c"],
2216          ["tan", "タン", "8.5"]];
2218     string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
2219     string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
2220     writeUnittestTsvFile(fpath_data3x1, data3x1);
2221     writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1 .. $]);
2223     string[][] data3x1ExpectedReplaceNum3 =
2224         [["field_a", "field_b", "field_c"],
2225          ["tan", "タン", "8.5"],
2226          ["tan", "タン", "8.5"],
2227          ["tan", "タン", "8.5"]];
2229     /* 3x2 */
2230     string[][] data3x2 =
2231         [["field_a", "field_b", "field_c"],
2232          ["brown", "褐色", "29.2"],
2233          ["gray", "グレー", "6.2"]];
2235     string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
2236     string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
2237     writeUnittestTsvFile(fpath_data3x2, data3x2);
2238     writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1 .. $]);
2240     string[][] data3x2PermuteCompat =
2241         [["field_a", "field_b", "field_c"],
2242          ["gray", "グレー", "6.2"],
2243          ["brown", "褐色", "29.2"]];
2245     string[][] data3x2PermuteShuffle =
2246         [["field_a", "field_b", "field_c"],
2247          ["gray", "グレー", "6.2"],
2248          ["brown", "褐色", "29.2"]];
2250     /* 3x3 */
2251     string[][] data3x3 =
2252         [["field_a", "field_b", "field_c"],
2253          ["orange", "オレンジ", "2.5"],
2254          ["pink", "ピンク", "1.1"],
2255          ["purple", "紫の", "42"]];
2257     string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
2258     string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
2259     writeUnittestTsvFile(fpath_data3x3, data3x3);
2260     writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1 .. $]);
2262     string[][] data3x3ExpectedPermuteCompat =
2263         [["field_a", "field_b", "field_c"],
2264          ["purple", "紫の", "42"],
2265          ["pink", "ピンク", "1.1"],
2266          ["orange", "オレンジ", "2.5"]];
2268     string[][] data3x3ExpectedPermuteSwap =
2269         [["field_a", "field_b", "field_c"],
2270          ["purple", "紫の", "42"],
2271          ["orange", "オレンジ", "2.5"],
2272          ["pink", "ピンク", "1.1"]];
2274     /* 3x6 */
2275     string[][] data3x6 =
2276         [["field_a", "field_b", "field_c"],
2277          ["red", "赤", "23.8"],
2278          ["green", "緑", "0.0072"],
2279          ["white", "白", "1.65"],
2280          ["yellow", "黄", "12"],
2281          ["blue", "青", "12"],
2282          ["black", "黒", "0.983"]];
2283     string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
2284     string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
2285     writeUnittestTsvFile(fpath_data3x6, data3x6);
2286     writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1 .. $]);
2288     // Randomization, all lines
2289     string[][] data3x6ExpectedPermuteCompat =
2290         [["field_a", "field_b", "field_c"],
2291          ["yellow", "黄", "12"],
2292          ["black", "黒", "0.983"],
2293          ["blue", "青", "12"],
2294          ["white", "白", "1.65"],
2295          ["green", "緑", "0.0072"],
2296          ["red", "赤", "23.8"]];
2298     string[][] data3x6ExpectedPermuteSwap =
2299         [["field_a", "field_b", "field_c"],
2300          ["black", "黒", "0.983"],
2301          ["green", "緑", "0.0072"],
2302          ["red", "赤", "23.8"],
2303          ["yellow", "黄", "12"],
2304          ["white", "白", "1.65"],
2305          ["blue", "青", "12"]];
2307     string[][] data3x6ExpectedPermuteCompatProbs =
2308         [["random_value", "field_a", "field_b", "field_c"],
2309          ["0.96055546286515892", "yellow", "黄", "12"],
2310          ["0.75710153928957880", "black", "黒", "0.983"],
2311          ["0.52525980887003243", "blue", "青", "12"],
2312          ["0.49287854949943721", "white", "白", "1.65"],
2313          ["0.15929344086907804", "green", "緑", "0.0072"],
2314          ["0.010968807619065046", "red", "赤", "23.8"]];
2316     /* Note: data3x6ExpectedSampleAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because
2317      * both are effectively the same algorithm given that --num is data length. Both read
2318      * in the full data in order then call randomShuffle.
2319      */
2320     string[][] data3x6ExpectedSampleAlgoRNum6 =
2321         [["field_a", "field_b", "field_c"],
2322          ["black", "黒", "0.983"],
2323          ["green", "緑", "0.0072"],
2324          ["red", "赤", "23.8"],
2325          ["yellow", "黄", "12"],
2326          ["white", "白", "1.65"],
2327          ["blue", "青", "12"]];
2329     string[][] data3x6ExpectedSampleAlgoRNum5 =
2330         [["field_a", "field_b", "field_c"],
2331          ["red", "赤", "23.8"],
2332          ["black", "黒", "0.983"],
2333          ["white", "白", "1.65"],
2334          ["green", "緑", "0.0072"],
2335          ["yellow", "黄", "12"]];
2337     string[][] data3x6ExpectedSampleAlgoRNum4 =
2338         [["field_a", "field_b", "field_c"],
2339          ["blue", "青", "12"],
2340          ["green", "緑", "0.0072"],
2341          ["black", "黒", "0.983"],
2342          ["white", "白", "1.65"]];
2344     string[][] data3x6ExpectedSampleAlgoRNum3 =
2345         [["field_a", "field_b", "field_c"],
2346          ["red", "赤", "23.8"],
2347          ["black", "黒", "0.983"],
2348          ["green", "緑", "0.0072"]];
2350     string[][] data3x6ExpectedSampleAlgoRNum2 =
2351         [["field_a", "field_b", "field_c"],
2352          ["black", "黒", "0.983"],
2353          ["red", "赤", "23.8"]];
2355     string[][] data3x6ExpectedSampleAlgoRNum1 =
2356         [["field_a", "field_b", "field_c"],
2357          ["green", "緑", "0.0072"]];
2359     /* Inorder versions. */
2360     string[][] data3x6ExpectedSampleAlgoRNum6Inorder =
2361         [["field_a", "field_b", "field_c"],
2362          ["red", "赤", "23.8"],
2363          ["green", "緑", "0.0072"],
2364          ["white", "白", "1.65"],
2365          ["yellow", "黄", "12"],
2366          ["blue", "青", "12"],
2367          ["black", "黒", "0.983"]];
2369     string[][] data3x6ExpectedSampleAlgoRNum5Inorder =
2370         [["field_a", "field_b", "field_c"],
2371          ["red", "赤", "23.8"],
2372          ["green", "緑", "0.0072"],
2373          ["white", "白", "1.65"],
2374          ["yellow", "黄", "12"],
2375          ["black", "黒", "0.983"]];
2377     string[][] data3x6ExpectedSampleAlgoRNum4Inorder =
2378         [["field_a", "field_b", "field_c"],
2379          ["green", "緑", "0.0072"],
2380          ["white", "白", "1.65"],
2381          ["blue", "青", "12"],
2382          ["black", "黒", "0.983"]];
2384     string[][] data3x6ExpectedSampleAlgoRNum3Inorder =
2385         [["field_a", "field_b", "field_c"],
2386          ["red", "赤", "23.8"],
2387          ["green", "緑", "0.0072"],
2388          ["black", "黒", "0.983"]];
2390     string[][] data3x6ExpectedSampleAlgoRNum2Inorder =
2391         [["field_a", "field_b", "field_c"],
2392          ["red", "赤", "23.8"],
2393          ["black", "黒", "0.983"]];
2395     string[][] data3x6ExpectedSampleAlgoRNum1Inorder =
2396         [["field_a", "field_b", "field_c"],
2397          ["green", "緑", "0.0072"]];
2399     /* Reservoir inorder */
2400     string[][] data3x6ExpectedSampleCompatNum6Inorder =
2401         [["field_a", "field_b", "field_c"],
2402          ["red", "赤", "23.8"],
2403          ["green", "緑", "0.0072"],
2404          ["white", "白", "1.65"],
2405          ["yellow", "黄", "12"],
2406          ["blue", "青", "12"],
2407          ["black", "黒", "0.983"]];
2409     string[][] data3x6ExpectedSampleCompatNum5Inorder =
2410         [["field_a", "field_b", "field_c"],
2411          ["green", "緑", "0.0072"],
2412          ["white", "白", "1.65"],
2413          ["yellow", "黄", "12"],
2414          ["blue", "青", "12"],
2415          ["black", "黒", "0.983"]];
2417     string[][] data3x6ExpectedSampleCompatNum4Inorder =
2418         [["field_a", "field_b", "field_c"],
2419          ["white", "白", "1.65"],
2420          ["yellow", "黄", "12"],
2421          ["blue", "青", "12"],
2422          ["black", "黒", "0.983"]];
2424     string[][] data3x6ExpectedSampleCompatNum3Inorder =
2425         [["field_a", "field_b", "field_c"],
2426          ["yellow", "黄", "12"],
2427          ["blue", "青", "12"],
2428          ["black", "黒", "0.983"]];
2430     string[][] data3x6ExpectedSampleCompatNum2Inorder =
2431         [["field_a", "field_b", "field_c"],
2432          ["yellow", "黄", "12"],
2433          ["black", "黒", "0.983"]];
2435     string[][] data3x6ExpectedSampleCompatNum1Inorder =
2436         [["field_a", "field_b", "field_c"],
2437          ["yellow", "黄", "12"]];
2440     /* Reservoir inorder with probabilities. */
2441     string[][] data3x6ExpectedSampleCompatNum6ProbsInorder =
2442         [["random_value", "field_a", "field_b", "field_c"],
2443          ["0.010968807619065046", "red", "赤", "23.8"],
2444          ["0.15929344086907804", "green", "緑", "0.0072"],
2445          ["0.49287854949943721", "white", "白", "1.65"],
2446          ["0.96055546286515892", "yellow", "黄", "12"],
2447          ["0.52525980887003243", "blue", "青", "12"],
2448          ["0.75710153928957880", "black", "黒", "0.983"]];
2450     string[][] data3x6ExpectedSampleCompatNum5ProbsInorder =
2451         [["random_value", "field_a", "field_b", "field_c"],
2452          ["0.15929344086907804", "green", "緑", "0.0072"],
2453          ["0.49287854949943721", "white", "白", "1.65"],
2454          ["0.96055546286515892", "yellow", "黄", "12"],
2455          ["0.52525980887003243", "blue", "青", "12"],
2456          ["0.75710153928957880", "black", "黒", "0.983"]];
2458     string[][] data3x6ExpectedSampleCompatNum4ProbsInorder =
2459         [["random_value", "field_a", "field_b", "field_c"],
2460          ["0.49287854949943721", "white", "白", "1.65"],
2461          ["0.96055546286515892", "yellow", "黄", "12"],
2462          ["0.52525980887003243", "blue", "青", "12"],
2463          ["0.75710153928957880", "black", "黒", "0.983"]];
2465     string[][] data3x6ExpectedSampleCompatNum3ProbsInorder =
2466         [["random_value", "field_a", "field_b", "field_c"],
2467          ["0.96055546286515892", "yellow", "黄", "12"],
2468          ["0.52525980887003243", "blue", "青", "12"],
2469          ["0.75710153928957880", "black", "黒", "0.983"]];
2471     string[][] data3x6ExpectedSampleCompatNum2ProbsInorder =
2472         [["random_value", "field_a", "field_b", "field_c"],
2473          ["0.96055546286515892", "yellow", "黄", "12"],
2474          ["0.75710153928957880", "black", "黒", "0.983"]];
2476     string[][] data3x6ExpectedSampleCompatNum1ProbsInorder =
2477         [["random_value", "field_a", "field_b", "field_c"],
2478          ["0.96055546286515892", "yellow", "黄", "12"]];
2480     string[][] data3x6ExpectedWt3Num6Inorder =
2481         [["field_a", "field_b", "field_c"],
2482          ["red", "赤", "23.8"],
2483          ["green", "緑", "0.0072"],
2484          ["white", "白", "1.65"],
2485          ["yellow", "黄", "12"],
2486          ["blue", "青", "12"],
2487          ["black", "黒", "0.983"]];
2489     string[][] data3x6ExpectedWt3Num5Inorder =
2490         [["field_a", "field_b", "field_c"],
2491          ["green", "緑", "0.0072"],
2492          ["white", "白", "1.65"],
2493          ["yellow", "黄", "12"],
2494          ["blue", "青", "12"],
2495          ["black", "黒", "0.983"]];
2497     string[][] data3x6ExpectedWt3Num4Inorder =
2498         [["field_a", "field_b", "field_c"],
2499          ["white", "白", "1.65"],
2500          ["yellow", "黄", "12"],
2501          ["blue", "青", "12"],
2502          ["black", "黒", "0.983"]];
2504     string[][] data3x6ExpectedWt3Num3Inorder =
2505         [["field_a", "field_b", "field_c"],
2506          ["yellow", "黄", "12"],
2507          ["blue", "青", "12"],
2508          ["black", "黒", "0.983"]];
2510     string[][] data3x6ExpectedWt3Num2Inorder =
2511         [["field_a", "field_b", "field_c"],
2512          ["yellow", "黄", "12"],
2513          ["black", "黒", "0.983"]];
2515     string[][] data3x6ExpectedWt3Num1Inorder =
2516         [["field_a", "field_b", "field_c"],
2517          ["yellow", "黄", "12"]];
2520     string[][] data3x6ExpectedBernoulliProbsP100 =
2521         [["random_value", "field_a", "field_b", "field_c"],
2522          ["0.010968807619065046", "red", "赤", "23.8"],
2523          ["0.15929344086907804", "green", "緑", "0.0072"],
2524          ["0.49287854949943721", "white", "白", "1.65"],
2525          ["0.96055546286515892", "yellow", "黄", "12"],
2526          ["0.52525980887003243", "blue", "青", "12"],
2527          ["0.75710153928957880", "black", "黒", "0.983"]];
2529     string[][] data3x6ExpectedBernoulliCompatProbsP60 =
2530         [["random_value", "field_a", "field_b", "field_c"],
2531          ["0.010968807619065046", "red", "赤", "23.8"],
2532          ["0.15929344086907804", "green", "緑", "0.0072"],
2533          ["0.49287854949943721", "white", "白", "1.65"],
2534          ["0.52525980887003243", "blue", "青", "12"]];
2536     string[][] data3x6ExpectedBernoulliSkipP40 =
2537         [["field_a", "field_b", "field_c"],
2538          ["red", "赤", "23.8"],
2539          ["green", "緑", "0.0072"],
2540          ["yellow", "黄", "12"]];
2542     string[][] data3x6ExpectedBernoulliCompatP60 =
2543         [["field_a", "field_b", "field_c"],
2544          ["red", "赤", "23.8"],
2545          ["green", "緑", "0.0072"],
2546          ["white", "白", "1.65"],
2547          ["blue", "青", "12"]];
2549     string[][] data3x6ExpectedDistinctK1K3P60 =
2550         [["field_a", "field_b", "field_c"],
2551          ["green", "緑", "0.0072"],
2552          ["white", "白", "1.65"],
2553          ["blue", "青", "12"]];
2555     string[][] data3x6ExpectedDistinctK1K3P60Probs =
2556         [["random_value", "field_a", "field_b", "field_c"],
2557          ["0", "green", "緑", "0.0072"],
2558          ["0", "white", "白", "1.65"],
2559          ["0", "blue", "青", "12"]];
2561     string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom =
2562         [["custom_random_value_header", "field_a", "field_b", "field_c"],
2563          ["0", "green", "緑", "0.0072"],
2564          ["0", "white", "白", "1.65"],
2565          ["0", "blue", "青", "12"]];
2567     string[][] data3x6ExpectedDistinctK2P2ProbsInorder =
2568         [["random_value", "field_a", "field_b", "field_c"],
2569          ["1", "red", "赤", "23.8"],
2570          ["0", "green", "緑", "0.0072"],
2571          ["0", "white", "白", "1.65"],
2572          ["1", "yellow", "黄", "12"],
2573          ["3", "blue", "青", "12"],
2574          ["2", "black", "黒", "0.983"]];
2576     string[][] data3x6ExpectedPermuteWt3Probs =
2577         [["random_value", "field_a", "field_b", "field_c"],
2578          ["0.99665198757645390", "yellow", "黄", "12"],
2579          ["0.94775884809836686", "blue", "青", "12"],
2580          ["0.82728234682286661", "red", "赤", "23.8"],
2581          ["0.75346697377181959", "black", "黒", "0.983"],
2582          ["0.65130103496422487", "white", "白", "1.65"],
2583          ["1.5636943712879866e-111", "green", "緑", "0.0072"]];
2585     string[][] data3x6ExpectedWt3ProbsInorder =
2586         [["random_value", "field_a", "field_b", "field_c"],
2587          ["0.82728234682286661", "red", "赤", "23.8"],
2588          ["1.5636943712879866e-111", "green", "緑", "0.0072"],
2589          ["0.65130103496422487", "white", "白", "1.65"],
2590          ["0.99665198757645390", "yellow", "黄", "12"],
2591          ["0.94775884809836686", "blue", "青", "12"],
2592          ["0.75346697377181959", "black", "黒", "0.983"]];
2594     string[][] data3x6ExpectedPermuteWt3 =
2595         [["field_a", "field_b", "field_c"],
2596          ["yellow", "黄", "12"],
2597          ["blue", "青", "12"],
2598          ["red", "赤", "23.8"],
2599          ["black", "黒", "0.983"],
2600          ["white", "白", "1.65"],
2601          ["green", "緑", "0.0072"]];
2604     string[][] data3x6ExpectedReplaceNum10 =
2605         [["field_a", "field_b", "field_c"],
2606          ["black", "黒", "0.983"],
2607          ["green", "緑", "0.0072"],
2608          ["green", "緑", "0.0072"],
2609          ["red", "赤", "23.8"],
2610          ["yellow", "黄", "12"],
2611          ["red", "赤", "23.8"],
2612          ["white", "白", "1.65"],
2613          ["yellow", "黄", "12"],
2614          ["yellow", "黄", "12"],
2615          ["white", "白", "1.65"],
2616         ];
2618     string[][] data3x6ExpectedReplaceNum10V77 =
2619         [["field_a", "field_b", "field_c"],
2620          ["black", "黒", "0.983"],
2621          ["red", "赤", "23.8"],
2622          ["black", "黒", "0.983"],
2623          ["yellow", "黄", "12"],
2624          ["green", "緑", "0.0072"],
2625          ["green", "緑", "0.0072"],
2626          ["green", "緑", "0.0072"],
2627          ["yellow", "黄", "12"],
2628          ["blue", "青", "12"],
2629          ["white", "白", "1.65"],
2630         ];
2632     /* Using a different static seed. */
2633     string[][] data3x6ExpectedPermuteCompatV41Probs =
2634         [["random_value", "field_a", "field_b", "field_c"],
2635          ["0.68057272653095424", "green", "緑", "0.0072"],
2636          ["0.67681624367833138", "blue", "青", "12"],
2637          ["0.32097338931635022", "yellow", "黄", "12"],
2638          ["0.25092361867427826", "red", "赤", "23.8"],
2639          ["0.15535934292711318", "black", "黒", "0.983"],
2640          ["0.046095821075141430", "white", "白", "1.65"]];
2642     string[][] data3x6ExpectedBernoulliCompatP60V41Probs =
2643         [["random_value", "field_a", "field_b", "field_c"],
2644          ["0.25092361867427826", "red", "赤", "23.8"],
2645          ["0.046095821075141430", "white", "白", "1.65"],
2646          ["0.32097338931635022", "yellow", "黄", "12"],
2647          ["0.15535934292711318", "black", "黒", "0.983"]];
2649     string[][] data3x6ExpectedPermuteWt3V41Probs =
2650         [["random_value", "field_a", "field_b", "field_c"],
2651          ["0.96799377498910666", "blue", "青", "12"],
2652          ["0.94356245792573568", "red", "赤", "23.8"],
2653          ["0.90964601024271996", "yellow", "黄", "12"],
2654          ["0.15491658409260103", "white", "白", "1.65"],
2655          ["0.15043620392537033", "black", "黒", "0.983"],
2656          ["6.1394674830701461e-24", "green", "緑", "0.0072"]];
2658     string[][] data3x6ExpectedWt3V41ProbsInorder =
2659         [["random_value", "field_a", "field_b", "field_c"],
2660          ["0.94356245792573568", "red", "赤", "23.8"],
2661          ["6.1394674830701461e-24", "green", "緑", "0.0072"],
2662          ["0.15491658409260103", "white", "白", "1.65"],
2663          ["0.90964601024271996", "yellow", "黄", "12"],
2664          ["0.96799377498910666", "blue", "青", "12"],
2665          ["0.15043620392537033", "black", "黒", "0.983"]];
2668     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2669     string[][] combo1ExpectedPermuteCompat =
2670         [["field_a", "field_b", "field_c"],
2671          ["yellow", "黄", "12"],
2672          ["tan", "タン", "8.5"],
2673          ["brown", "褐色", "29.2"],
2674          ["green", "緑", "0.0072"],
2675          ["red", "赤", "23.8"],
2676          ["purple", "紫の", "42"],
2677          ["black", "黒", "0.983"],
2678          ["white", "白", "1.65"],
2679          ["gray", "グレー", "6.2"],
2680          ["blue", "青", "12"],
2681          ["pink", "ピンク", "1.1"],
2682          ["orange", "オレンジ", "2.5"]];
2684     string[][] combo1ExpectedPermuteCompatProbs =
2685         [["random_value", "field_a", "field_b", "field_c"],
2686          ["0.97088520275428891", "yellow", "黄", "12"],
2687          ["0.96055546286515892", "tan", "タン", "8.5"],
2688          ["0.81756894313730299", "brown", "褐色", "29.2"],
2689          ["0.75710153928957880", "green", "緑", "0.0072"],
2690          ["0.52525980887003243", "red", "赤", "23.8"],
2691          ["0.49287854949943721", "purple", "紫の", "42"],
2692          ["0.47081507067196071", "black", "黒", "0.983"],
2693          ["0.38388182921335101", "white", "白", "1.65"],
2694          ["0.29215990612283349", "gray", "グレー", "6.2"],
2695          ["0.24033216014504433", "blue", "青", "12"],
2696          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2697          ["0.010968807619065046", "orange", "オレンジ", "2.5"]];
2699     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2700     string[][] combo1ExpectedProbsInorder =
2701         [["random_value", "field_a", "field_b", "field_c"],
2702          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2703          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2704          ["0.49287854949943721", "purple", "紫の", "42"],
2705          ["0.96055546286515892", "tan", "タン", "8.5"],
2706          ["0.52525980887003243", "red", "赤", "23.8"],
2707          ["0.75710153928957880", "green", "緑", "0.0072"],
2708          ["0.38388182921335101", "white", "白", "1.65"],
2709          ["0.97088520275428891", "yellow", "黄", "12"],
2710          ["0.24033216014504433", "blue", "青", "12"],
2711          ["0.47081507067196071", "black", "黒", "0.983"],
2712          ["0.81756894313730299", "brown", "褐色", "29.2"],
2713          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2715     string[][] combo1ExpectedBernoulliCompatP50Probs =
2716         [["random_value", "field_a", "field_b", "field_c"],
2717          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2718          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2719          ["0.49287854949943721", "purple", "紫の", "42"],
2720          ["0.38388182921335101", "white", "白", "1.65"],
2721          ["0.24033216014504433", "blue", "青", "12"],
2722          ["0.47081507067196071", "black", "黒", "0.983"],
2723          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2725     string[][] combo1ExpectedBernoulliCompatP40 =
2726         [["field_a", "field_b", "field_c"],
2727          ["orange", "オレンジ", "2.5"],
2728          ["pink", "ピンク", "1.1"],
2729          ["white", "白", "1.65"],
2730          ["blue", "青", "12"],
2731          ["gray", "グレー", "6.2"]];
2733     string[][] combo1ExpectedDistinctK1P40 =
2734         [["field_a", "field_b", "field_c"],
2735          ["orange", "オレンジ", "2.5"],
2736          ["red", "赤", "23.8"],
2737          ["green", "緑", "0.0072"],
2738          ["blue", "青", "12"],
2739          ["black", "黒", "0.983"]];
2741     string[][] combo1ExpectedPermuteWt3Probs =
2742         [["random_value", "field_a", "field_b", "field_c"],
2743          ["0.99754077523718754", "yellow", "黄", "12"],
2744          ["0.99527665440088786", "tan", "タン", "8.5"],
2745          ["0.99312578945741659", "brown", "褐色", "29.2"],
2746          ["0.98329602553389361", "purple", "紫の", "42"],
2747          ["0.97330961938083660", "red", "赤", "23.8"],
2748          ["0.88797551521739648", "blue", "青", "12"],
2749          ["0.81999230489041786", "gray", "グレー", "6.2"],
2750          ["0.55975569204250941", "white", "白", "1.65"],
2751          ["0.46472135609205739", "black", "黒", "0.983"],
2752          ["0.18824582704191337", "pink", "ピンク", "1.1"],
2753          ["0.16446131853299920", "orange", "オレンジ", "2.5"],
2754          ["1.6438086931020549e-17", "green", "緑", "0.0072"]];
2756     string[][] combo1ExpectedPermuteWt3 =
2757         [["field_a", "field_b", "field_c"],
2758          ["yellow", "黄", "12"],
2759          ["tan", "タン", "8.5"],
2760          ["brown", "褐色", "29.2"],
2761          ["purple", "紫の", "42"],
2762          ["red", "赤", "23.8"],
2763          ["blue", "青", "12"],
2764          ["gray", "グレー", "6.2"],
2765          ["white", "白", "1.65"],
2766          ["black", "黒", "0.983"],
2767          ["pink", "ピンク", "1.1"],
2768          ["orange", "オレンジ", "2.5"],
2769          ["green", "緑", "0.0072"]];
2771         string[][] combo1ExpectedSampleAlgoRNum4 =
2772         [["field_a", "field_b", "field_c"],
2773          ["blue", "青", "12"],
2774          ["gray", "グレー", "6.2"],
2775          ["brown", "褐色", "29.2"],
2776          ["white", "白", "1.65"]];
2778         string[][] combo1ExpectedSampleAlgoRNum4Inorder =
2779         [["field_a", "field_b", "field_c"],
2780          ["white", "白", "1.65"],
2781          ["blue", "青", "12"],
2782          ["brown", "褐色", "29.2"],
2783          ["gray", "グレー", "6.2"]];
2785     string[][] combo1ExpectedReplaceNum10 =
2786         [["field_a", "field_b", "field_c"],
2787          ["gray", "グレー", "6.2"],
2788          ["yellow", "黄", "12"],
2789          ["yellow", "黄", "12"],
2790          ["white", "白", "1.65"],
2791          ["tan", "タン", "8.5"],
2792          ["white", "白", "1.65"],
2793          ["blue", "青", "12"],
2794          ["black", "黒", "0.983"],
2795          ["tan", "タン", "8.5"],
2796          ["purple", "紫の", "42"]];
2798     /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. */
2799     string[][] data1x200 =
2800         [["field_a"],
2801          ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"],
2802          ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"],
2803          ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"],
2804          ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"],
2805          ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"],
2806          ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"],
2807          ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"],
2808          ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"],
2809          ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"],
2810          ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"],
2811          ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"],
2812          ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"],
2813          ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"],
2814          ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"],
2815          ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"],
2816          ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"],
2817          ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"],
2818          ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"],
2819          ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"],
2820          ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"],
2821         ];
2823     string fpath_data1x200 = buildPath(testDir, "data1x200.tsv");
2824     string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv");
2825     writeUnittestTsvFile(fpath_data1x200, data1x200);
2826     writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1 .. $]);
2828     string[][] data1x200ExpectedBernoulliSkipV333P01 =
2829         [["field_a"],
2830          ["077"],
2831          ["119"]];
2833     string[][] data1x200ExpectedBernoulliSkipV333P02 =
2834         [["field_a"],
2835          ["038"],
2836          ["059"],
2837          ["124"],
2838          ["161"],
2839          ["162"],
2840          ["183"]];
2842     string[][] data1x200ExpectedBernoulliSkipV333P03 =
2843         [["field_a"],
2844          ["025"],
2845          ["039"],
2846          ["082"],
2847          ["107"],
2848          ["108"],
2849          ["122"],
2850          ["136"],
2851          ["166"],
2852          ["182"]];
2854     string[][] data1x200ExpectedBernoulliCompatV333P01 =
2855         [["field_a"],
2856          ["072"]];
2858     string[][] data1x200ExpectedBernoulliCompatV333P02 =
2859         [["field_a"],
2860          ["004"],
2861          ["072"]];
2863     string[][] data1x200ExpectedBernoulliCompatV333P03 =
2864         [["field_a"],
2865          ["004"],
2866          ["072"],
2867          ["181"]];
2869     /* Combo 2, for Bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files,
2870      * only expected results. The header is from 3x0. The results are offset by one position
2871      * from data1x200ExpectedBernoulliSkipV333P03 because a single data line precedes them.
2872      */
2873     string[][] combo2ExpectedBernoulliSkipV333P03 =
2874         [["field_a", "field_b", "field_c"],
2875          ["024"],
2876          ["038"],
2877          ["081"],
2878          ["106"],
2879          ["107"],
2880          ["121"],
2881          ["135"],
2882          ["165"],
2883          ["181"]];
2886     /* 1x10 - Simple 1-column file. */
2887     string[][] data1x10 =
2888         [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
2889     string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
2890     string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
2891     writeUnittestTsvFile(fpath_data1x10, data1x10);
2892     writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1 .. $]);
2894     string[][] data1x10ExpectedPermuteCompat =
2895         [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
2897     string[][] data1x10ExpectedPermuteWt1 =
2898         [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]];
2900     /* 2x10a - Uniform distribution [0,1]. */
2901     string[][] data2x10a =
2902         [["line", "weight"],
2903          ["1", "0.26788837"],
2904          ["2", "0.06601298"],
2905          ["3", "0.38627527"],
2906          ["4", "0.47379424"],
2907          ["5", "0.02966641"],
2908          ["6", "0.05636231"],
2909          ["7", "0.70529242"],
2910          ["8", "0.91836862"],
2911          ["9", "0.99103720"],
2912          ["10", "0.31401740"]];
2914     string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv");
2915     writeUnittestTsvFile(fpath_data2x10a, data2x10a);
2917     string[][] data2x10aExpectedPermuteWt2Probs =
2918         [["random_value", "line", "weight"],
2919          ["0.96833865494543658", "8", "0.91836862"],
2920          ["0.91856842054413923", "4", "0.47379424"],
2921          ["0.25730832087795091", "7", "0.70529242"],
2922          ["0.23725317907018120", "9", "0.99103720"],
2923          ["0.16016096701872204", "3", "0.38627527"],
2924          ["0.090819662667243381", "10", "0.31401740"],
2925          ["0.0071764539244361172", "6", "0.05636231"],
2926          ["0.000000048318642951630057", "1", "0.26788837"],
2927          ["0.00000000037525692966535517", "5", "0.02966641"],
2928          ["8.2123247880095796e-13", "2", "0.06601298"]];
2930     /* 2x10b - Uniform distribution [0,1000]. */
2931     string[][] data2x10b =
2932         [["line", "weight"],
2933          ["1", "761"],
2934          ["2", "432"],
2935          ["3", "103"],
2936          ["4", "448"],
2937          ["5", "750"],
2938          ["6", "711"],
2939          ["7", "867"],
2940          ["8", "841"],
2941          ["9", "963"],
2942          ["10", "784"]];
2944     string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv");
2945     writeUnittestTsvFile(fpath_data2x10b, data2x10b);
2947     string[][] data2x10bExpectedPermuteWt2Probs =
2948         [["random_value", "line", "weight"],
2949          ["0.99996486739067969", "8", "841"],
2950          ["0.99991017467137211", "4", "448"],
2951          ["0.99960871524873662", "6", "711"],
2952          ["0.99914188537143800", "5", "750"],
2953          ["0.99903963250274785", "10", "784"],
2954          ["0.99889631825931946", "7", "867"],
2955          ["0.99852058315191139", "9", "963"],
2956          ["0.99575669679158918", "2", "432"],
2957          ["0.99408758732050595", "1", "761"],
2958          ["0.99315467761212362", "3", "103"]];
2960     /* 2x10c - Logarithmic distribution in random order. */
2961     string[][] data2x10c =
2962         [["line", "weight"],
2963          ["1", "31.85"],
2964          ["2", "17403.31"],
2965          ["3", "653.84"],
2966          ["4", "8.23"],
2967          ["5", "2671.04"],
2968          ["6", "26226.08"],
2969          ["7", "1.79"],
2970          ["8", "354.56"],
2971          ["9", "35213.81"],
2972          ["10", "679.29"]];
2974     string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv");
2975     writeUnittestTsvFile(fpath_data2x10c, data2x10c);
2977     string[][] data2x10cExpectedPermuteWt2Probs =
2978         [["random_value", "line", "weight"],
2979          ["0.99998939008709697", "6", "26226.08"],
2980          ["0.99995951291695517", "9", "35213.81"],
2981          ["0.99991666907613541", "8", "354.56"],
2982          ["0.99989445052186410", "2", "17403.31"],
2983          ["0.99975897602861630", "5", "2671.04"],
2984          ["0.99891852769877643", "3", "653.84"],
2985          ["0.99889167752782515", "10", "679.29"],
2986          ["0.99512207506850148", "4", "8.23"],
2987          ["0.86789371584259023", "1", "31.85"],
2988          ["0.58574438162915610", "7", "1.79"]];
2990     /* 2x10d - Logarithmic distribution in ascending order. */
2991     string[][] data2x10d =
2992         [["line", "weight"],
2993          ["1", "1.79"],
2994          ["2", "8.23"],
2995          ["3", "31.85"],
2996          ["4", "354.56"],
2997          ["5", "653.84"],
2998          ["6", "679.29"],
2999          ["7", "2671.04"],
3000          ["8", "17403.31"],
3001          ["9", "26226.08"],
3002          ["10", "35213.81"]];
3004     string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv");
3005     writeUnittestTsvFile(fpath_data2x10d, data2x10d);
3007     string[][] data2x10dExpectedPermuteWt2Probs =
3008         [["random_value", "line", "weight"],
3009          ["0.99999830221846353", "8", "17403.31"],
3010          ["0.99997860834041397", "10", "35213.81"],
3011          ["0.99994563828986716", "9", "26226.08"],
3012          ["0.99988650363575737", "4", "354.56"],
3013          ["0.99964161939190088", "7", "2671.04"],
3014          ["0.99959045338948649", "6", "679.29"],
3015          ["0.99901574490639788", "5", "653.84"],
3016          ["0.97803163304747431", "3", "31.85"],
3017          ["0.79994791806910948", "2", "8.23"],
3018          ["0.080374261239949119", "1", "1.79"]];
3020     /* 2x10e - Logarithmic distribution in descending order. */
3021     string[][] data2x10e =
3022         [["line", "weight"],
3023          ["1", "35213.81"],
3024          ["2", "26226.08"],
3025          ["3", "17403.31"],
3026          ["4", "2671.04"],
3027          ["5", "679.29"],
3028          ["6", "653.84"],
3029          ["7", "354.56"],
3030          ["8", "31.85"],
3031          ["9", "8.23"],
3032          ["10", "1.79"]];
3033     string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
3034     writeUnittestTsvFile(fpath_data2x10e, data2x10e);
3036     string[][] data2x10eExpectedPermuteWt2Probs =
3037         [["random_value", "line", "weight"],
3038          ["0.99998493348975237", "4", "2671.04"],
3039          ["0.99995934807202624", "3", "17403.31"],
3040          ["0.99992995739727453", "2", "26226.08"],
3041          ["0.99987185679245649", "1", "35213.81"],
3042          ["0.99957451563173938", "6", "653.84"],
3043          ["0.99907273650209583", "8", "31.85"],
3044          ["0.99905260312968946", "5", "679.29"],
3045          ["0.99730333650516401", "7", "354.56"],
3046          ["0.84093902435227808", "9", "8.23"],
3047          ["0.65650015926290028", "10", "1.79"]];
3049     /* Data sets for distinct sampling. */
3050     string[][] data5x25 =
3051         [["ID", "Shape", "Color", "Size", "Weight"],
3052          ["01", "circle", "red", "S", "10"],
3053          ["02", "circle", "black", "L", "20"],
3054          ["03", "square", "black", "L", "20"],
3055          ["04", "circle", "green", "L", "30"],
3056          ["05", "ellipse", "red", "S", "20"],
3057          ["06", "triangle", "red", "S", "10"],
3058          ["07", "triangle", "red", "L", "20"],
3059          ["08", "square", "black", "S", "10"],
3060          ["09", "circle", "black", "S", "20"],
3061          ["10", "square", "green", "L", "20"],
3062          ["11", "triangle", "red", "L", "20"],
3063          ["12", "circle", "green", "L", "30"],
3064          ["13", "ellipse", "red", "S", "20"],
3065          ["14", "circle", "green", "L", "30"],
3066          ["15", "ellipse", "red", "L", "30"],
3067          ["16", "square", "red", "S", "10"],
3068          ["17", "circle", "black", "L", "20"],
3069          ["18", "square", "red", "S", "20"],
3070          ["19", "square", "black", "L", "20"],
3071          ["20", "circle", "red", "S", "10"],
3072          ["21", "ellipse", "black", "L", "30"],
3073          ["22", "triangle", "red", "L", "30"],
3074          ["23", "circle", "green", "S", "20"],
3075          ["24", "square", "green", "L", "20"],
3076          ["25", "circle", "red", "S", "10"],
3077         ];
3079     string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
3080     string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
3081     writeUnittestTsvFile(fpath_data5x25, data5x25);
3082     writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1 .. $]);
3084     string[][] data5x25ExpectedDistinctK2P40 =
3085         [["ID", "Shape", "Color", "Size", "Weight"],
3086          ["03", "square", "black", "L", "20"],
3087          ["05", "ellipse", "red", "S", "20"],
3088          ["08", "square", "black", "S", "10"],
3089          ["10", "square", "green", "L", "20"],
3090          ["13", "ellipse", "red", "S", "20"],
3091          ["15", "ellipse", "red", "L", "30"],
3092          ["16", "square", "red", "S", "10"],
3093          ["18", "square", "red", "S", "20"],
3094          ["19", "square", "black", "L", "20"],
3095          ["21", "ellipse", "black", "L", "30"],
3096          ["24", "square", "green", "L", "20"],
3097         ];
3099     string[][] data5x25ExpectedDistinctK2K4P20 =
3100         [["ID", "Shape", "Color", "Size", "Weight"],
3101          ["03", "square", "black", "L", "20"],
3102          ["07", "triangle", "red", "L", "20"],
3103          ["08", "square", "black", "S", "10"],
3104          ["10", "square", "green", "L", "20"],
3105          ["11", "triangle", "red", "L", "20"],
3106          ["16", "square", "red", "S", "10"],
3107          ["18", "square", "red", "S", "20"],
3108          ["19", "square", "black", "L", "20"],
3109          ["22", "triangle", "red", "L", "30"],
3110          ["24", "square", "green", "L", "20"],
3111         ];
3113     string[][] data5x25ExpectedDistinctK2K3K4P20 =
3114         [["ID", "Shape", "Color", "Size", "Weight"],
3115          ["04", "circle", "green", "L", "30"],
3116          ["07", "triangle", "red", "L", "20"],
3117          ["09", "circle", "black", "S", "20"],
3118          ["11", "triangle", "red", "L", "20"],
3119          ["12", "circle", "green", "L", "30"],
3120          ["14", "circle", "green", "L", "30"],
3121          ["16", "square", "red", "S", "10"],
3122          ["18", "square", "red", "S", "20"],
3123          ["22", "triangle", "red", "L", "30"],
3124         ];
3126     /* Fields 2 and 4 from data5x25. Distinct sampling should select the same rows for equivalent keys. */
3127     string[][] data2x25 =
3128         [["Shape", "Size"],
3129          ["circle", "S"],
3130          ["circle", "L"],
3131          ["square", "L"],
3132          ["circle", "L"],
3133          ["ellipse", "S"],
3134          ["triangle", "S"],
3135          ["triangle", "L"],
3136          ["square", "S"],
3137          ["circle", "S"],
3138          ["square", "L"],
3139          ["triangle", "L"],
3140          ["circle", "L"],
3141          ["ellipse", "S"],
3142          ["circle", "L"],
3143          ["ellipse", "L"],
3144          ["square", "S"],
3145          ["circle", "L"],
3146          ["square", "S"],
3147          ["square", "L"],
3148          ["circle", "S"],
3149          ["ellipse", "L"],
3150          ["triangle", "L"],
3151          ["circle", "S"],
3152          ["square", "L"],
3153          ["circle", "S"],
3154         ];
3156     string fpath_data2x25 = buildPath(testDir, "data2x25.tsv");
3157     string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv");
3158     writeUnittestTsvFile(fpath_data2x25, data2x25);
3159     writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1 .. $]);
3161     string[][] data2x25ExpectedDistinctK1K2P20 =
3162         [["Shape", "Size"],
3163          ["square", "L"],
3164          ["triangle", "L"],
3165          ["square", "S"],
3166          ["square", "L"],
3167          ["triangle", "L"],
3168          ["square", "S"],
3169          ["square", "S"],
3170          ["square", "L"],
3171          ["triangle", "L"],
3172          ["square", "L"],
3173         ];
3175     string[][] data1x25 =
3176         [["Shape-Size"],
3177          ["circle-S"],
3178          ["circle-L"],
3179          ["square-L"],
3180          ["circle-L"],
3181          ["ellipse-S"],
3182          ["triangle-S"],
3183          ["triangle-L"],
3184          ["square-S"],
3185          ["circle-S"],
3186          ["square-L"],
3187          ["triangle-L"],
3188          ["circle-L"],
3189          ["ellipse-S"],
3190          ["circle-L"],
3191          ["ellipse-L"],
3192          ["square-S"],
3193          ["circle-L"],
3194          ["square-S"],
3195          ["square-L"],
3196          ["circle-S"],
3197          ["ellipse-L"],
3198          ["triangle-L"],
3199          ["circle-S"],
3200          ["square-L"],
3201          ["circle-S"],
3202         ];
3204     string fpath_data1x25 = buildPath(testDir, "data1x25.tsv");
3205     string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv");
3206     writeUnittestTsvFile(fpath_data1x25, data1x25);
3207     writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1 .. $]);
3209     string[][] data1x25ExpectedDistinctK1P20 =
3210         [["Shape-Size"],
3211          ["triangle-L"],
3212          ["square-S"],
3213          ["triangle-L"],
3214          ["ellipse-L"],
3215          ["square-S"],
3216          ["square-S"],
3217          ["ellipse-L"],
3218          ["triangle-L"],
3219         ];
3221     string[][] data1x25ExpectedDistinctK1P20Probs =
3222         [["random_value", "Shape-Size"],
3223          ["0", "triangle-L"],
3224          ["0", "square-S"],
3225          ["0", "triangle-L"],
3226          ["0", "ellipse-L"],
3227          ["0", "square-S"],
3228          ["0", "square-S"],
3229          ["0", "ellipse-L"],
3230          ["0", "triangle-L"],
3231         ];
3233     string[][] data1x25ExpectedDistinctK1P20ProbsInorder =
3234         [["random_value", "Shape-Size"],
3235          ["1", "circle-S"],
3236          ["4", "circle-L"],
3237          ["2", "square-L"],
3238          ["4", "circle-L"],
3239          ["2", "ellipse-S"],
3240          ["1", "triangle-S"],
3241          ["0", "triangle-L"],
3242          ["0", "square-S"],
3243          ["1", "circle-S"],
3244          ["2", "square-L"],
3245          ["0", "triangle-L"],
3246          ["4", "circle-L"],
3247          ["2", "ellipse-S"],
3248          ["4", "circle-L"],
3249          ["0", "ellipse-L"],
3250          ["0", "square-S"],
3251          ["4", "circle-L"],
3252          ["0", "square-S"],
3253          ["2", "square-L"],
3254          ["1", "circle-S"],
3255          ["0", "ellipse-L"],
3256          ["0", "triangle-L"],
3257          ["1", "circle-S"],
3258          ["2", "square-L"],
3259          ["1", "circle-S"],
3260         ];
3262     /*
3263      * Enough setup! Actually run some tests!
3264      */
3266     /* Shuffling tests. Headers, static seed, compatibility mode. With weights and without. */
3267     testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty);
3268     testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0);
3269     testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1);
3270     testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat);
3271     testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat);
3272     testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat);
3273     testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
3274     testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
3275     testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3276     testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3277     testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3278     testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
3279     testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
3281     /* Shuffling, without compatibility mode, or with both compatibility and printing. */
3282     testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
3283     testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0);
3284     testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1);
3285     testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle);
3286     testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap);
3287     testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap);
3288     testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
3289     testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3290     testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3292     /* Reservoir sampling using Algorithm R.
3293      * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.)
3294      */
3295     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
3296     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
3297     testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0);
3298     testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0);
3299     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1);
3300     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1);
3301     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6);
3302     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6);
3303     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5);
3304     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4);
3305     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3);
3306     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2);
3307     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1);
3309     /* Inorder versions of Algorithm R tests. */
3310     testTsvSample(["test-ai10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
3311     testTsvSample(["test-ai11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
3312     testTsvSample(["test-ai12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
3313     testTsvSample(["test-ai13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0);
3314     testTsvSample(["test-ai14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1);
3315     testTsvSample(["test-ai15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1);
3316     testTsvSample(["test-ai16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder);
3317     testTsvSample(["test-ai17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder);
3318     testTsvSample(["test-ai18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5Inorder);
3319     testTsvSample(["test-ai19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4Inorder);
3320     testTsvSample(["test-ai20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3Inorder);
3321     testTsvSample(["test-ai21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2Inorder);
3322     testTsvSample(["test-ai22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1Inorder);
3324     /* Bernoulli sampling cases. */
3325     testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty);
3326     testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0);
3327     testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1);
3328     testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6);
3329     testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6);
3330     testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3331     testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60);
3332     testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60);
3333     testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs);
3335     /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. */
3336     testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01);
3337     testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02);
3338     testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03);
3339     testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01);
3340     testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02);
3341     testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03);
3342     testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40);
3344     /* Distinct sampling cases. */
3345     testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
3346     testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
3347     testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1);
3348     testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6);
3349     testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
3352     /* Generating random weights. Use the Bernoulli sampling test set at prob 100% for uniform sampling.
3353      * For weighted sampling, use the weighted cases, but with expected results in the original input order.
3354      */
3355     testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3356     testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3357     testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
3358                   data3x6ExpectedWt3ProbsInorder);
3359     testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
3360                   data3x6ExpectedWt3V41ProbsInorder);
3361     testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6],
3362                   data3x6ExpectedDistinctK1K3P60Probs);
3363     testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header",
3364                    "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom);
3365     testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
3366                   data3x6ExpectedDistinctK2P2ProbsInorder);
3368     /* Simple random sampling with replacement. */
3369     testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty);
3370     testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty);
3371     testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0);
3372     testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0);
3373     testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3);
3374     testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10);
3375     testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77);
3377     /* Shuffling, compatibility mode, without headers. */
3378     testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1 .. $]);
3379     testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1 .. $]);
3380     testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1 .. $]);
3381     testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1 .. $]);
3382     testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1 .. $]);
3383     testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]);
3384     testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]);
3385     testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]);
3386     testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. $]);
3388     /* Shuffling, no headers, without compatibility mode, or with printing and compatibility mode. */
3389     testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1 .. $]);
3390     testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1 .. $]);
3391     testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1 .. $]);
3392     testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1 .. $]);
3393     testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]);
3394     testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]);
3395     testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]);
3397     /* Reservoir sampling using Algorithm R, no headers. */
3398     testTsvSample(["test-ac10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
3399     testTsvSample(["test-ac11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
3400     testTsvSample(["test-ac14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1 .. $]);
3401     testTsvSample(["test-ac15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1 .. $]);
3402     testTsvSample(["test-ac16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]);
3403     testTsvSample(["test-ac17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]);
3404     testTsvSample(["test-ac18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5[1 .. $]);
3405     testTsvSample(["test-ac19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4[1 .. $]);
3406     testTsvSample(["test-ac20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3[1 .. $]);
3407     testTsvSample(["test-ac21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2[1 .. $]);
3408     testTsvSample(["test-ac22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]);
3410     /* Reservoir sampling using Algorithm R, no headers, inorder output. */
3411     testTsvSample(["test-aj10", "--prefer-algorithm-r", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
3412     testTsvSample(["test-aj11", "--prefer-algorithm-r", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
3413     testTsvSample(["test-aj14", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3414     testTsvSample(["test-aj15", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3415     testTsvSample(["test-aj16", "--prefer-algorithm-r", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]);
3416     testTsvSample(["test-aj17", "--prefer-algorithm-r", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]);
3417     testTsvSample(["test-aj18", "--prefer-algorithm-r", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5Inorder[1 .. $]);
3418     testTsvSample(["test-aj19", "--prefer-algorithm-r", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4Inorder[1 .. $]);
3419     testTsvSample(["test-aj20", "--prefer-algorithm-r", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3Inorder[1 .. $]);
3420     testTsvSample(["test-aj21", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2Inorder[1 .. $]);
3421     testTsvSample(["test-aj22", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]);
3423     /* Bernoulli sampling cases. */
3424     testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1 .. $]);
3425     testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]);
3426     testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]);
3427     testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
3428     testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1 .. $]);
3429     testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. $]);
3431     /* Bernoulli sampling with probabilities in skip sampling range. */
3432     testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1 .. $]);
3433     testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1 .. $]);
3434     testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1 .. $]);
3435     testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1 .. $]);
3436     testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1 .. $]);
3437     testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1 .. $]);
3438     testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. $]);
3440     /* Distinct sampling cases. */
3441     testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1 .. $]);
3442     testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3443     testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3444     testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3446     /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */
3447     testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
3448     testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1 .. $]);
3449     testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
3450                   data3x6ExpectedDistinctK1K3P60Probs[1 .. $]);
3451     testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
3452                   data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]);
3454     /* Simple random sampling with replacement. */
3455     testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
3456     testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
3457     testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1 .. $]);
3458     testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1 .. $]);
3459     testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]);
3461     /* Multi-file tests. */
3462     testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode",
3463                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3464                   combo1ExpectedPermuteCompat);
3465     testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
3466                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3467                   combo1ExpectedPermuteCompatProbs);
3468     testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
3469                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3470                   combo1ExpectedPermuteWt3Probs);
3471     testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode",
3472                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3473                   combo1ExpectedPermuteWt3);
3474     testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4",
3475                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3476                   combo1ExpectedSampleAlgoRNum4);
3477     testTsvSample(["test-c5b", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder",
3478                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3479                   combo1ExpectedSampleAlgoRNum4Inorder);
3481     /* Multi-file, no headers. */
3482     testTsvSample(["test-c6", "--static-seed", "--compatibility-mode",
3483                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3484                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3485                   combo1ExpectedPermuteCompat[1 .. $]);
3486     testTsvSample(["test-c7", "--static-seed", "--print-random",
3487                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3488                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3489                   combo1ExpectedPermuteCompatProbs[1 .. $]);
3490     testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3",
3491                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3492                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3493                   combo1ExpectedPermuteWt3Probs[1 .. $]);
3494     testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode",
3495                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3496                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3497                   combo1ExpectedPermuteWt3[1 .. $]);
3498     testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4",
3499                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3500                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3501                   combo1ExpectedSampleAlgoRNum4[1 .. $]);
3502     testTsvSample(["test-c10b", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder",
3503                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3504                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3505                   combo1ExpectedSampleAlgoRNum4Inorder[1 .. $]);
3507     /* Bernoulli sampling cases. */
3508     testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5",
3509                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3510                   combo1ExpectedBernoulliCompatP50Probs);
3511     testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4",
3512                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3513                   combo1ExpectedBernoulliCompatP40);
3514     testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5",
3515                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3516                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3517                   combo1ExpectedBernoulliCompatP50Probs[1 .. $]);
3518     testTsvSample(["test-c14", "--static-seed", "--prob", ".4",
3519                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3520                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3521                   combo1ExpectedBernoulliCompatP40[1 .. $]);
3523     /* Bernoulli sampling with probabilities in skip sampling range. */
3524     testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03",
3525                    fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10],
3526                   combo2ExpectedBernoulliSkipV333P03);
3527     testTsvSample(["test-cc1", "-v", "333", "-p", "0.03",
3528                    fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader],
3529                   combo2ExpectedBernoulliSkipV333P03[1 .. $]);
3531     /* Distinct sampling cases. */
3532     testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
3533                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3534                   combo1ExpectedDistinctK1P40);
3535     testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4",
3536                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3537                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3538                   combo1ExpectedDistinctK1P40[1 .. $]);
3540     /* Generating random weights. */
3541     testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder",
3542                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3543                   combo1ExpectedProbsInorder);
3544     testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder",
3545                    fpath_data3x3_noheader, fpath_data3x1_noheader,
3546                    fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
3547                   combo1ExpectedProbsInorder[1 .. $]);
3549     /* Simple random sampling with replacement. */
3550     testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10",
3551                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3552                   combo1ExpectedReplaceNum10);
3554     testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10",
3555                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3556                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3557                   combo1ExpectedReplaceNum10[1 .. $]);
3559     /* Single column file. */
3560     testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
3561     testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
3563     /* Distributions. */
3564     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
3565     testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs);
3566     testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs);
3567     testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs);
3568     testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs);
3570     /* Tests of subset sample (--n|num) field. Random sampling, Bernoulli sampling, distinct sampling.
3571      *
3572      * Note: The way these tests are done ensures that subset length does not affect
3573      * output order.
3574      */
3575     import std.algorithm : min;
3576     for (size_t n = data3x6.length + 2; n >= 1; n--)
3577     {
3578         /* reservoirSamplingViaHeap.
3579          */
3580         size_t expectedLength = min(data3x6.length, n + 1);
3581         testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
3582                        "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
3584         testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
3585                        "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
3587         testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
3588                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]);
3590         testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
3591                        "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]);
3593         testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
3594                        "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]);
3596         testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
3597                        fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]);
3599         testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
3600                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]);
3602         testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
3603                        "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]);
3605         testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string,
3606                        "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]);
3608         /* Bernoulli sampling.
3609          */
3610         import std.algorithm : min;
3611         size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length);
3613         testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3614                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]);
3616         testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3617                        "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]);
3619         testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3620                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]);
3622         testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3623                        fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]);
3625         /* Distinct Sampling.
3626          */
3627         size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length);
3629         testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
3630                        "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]);
3632         testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
3633                        fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]);
3635         testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
3636                        "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]);
3638         testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
3639                        fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
3640     }
3642     /* Similar tests with the 1x10 data set. */
3643     for (size_t n = data1x10.length + 2; n >= 1; n--)
3644     {
3645         size_t expectedLength = min(data1x10.length, n + 1);
3646         testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
3647                        "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);
3649         testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
3650                        "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);
3652         testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
3653                        fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);
3655         testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
3656                        "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
3657     }
3659     /* Simple random sampling with replacement: ensure sample size doesn't change order. */
3660     for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
3661     {
3662         testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
3663                       data3x6ExpectedReplaceNum10[0 .. n + 1]);
3665         testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
3666                       data3x6ExpectedReplaceNum10[1 .. n + 1]);
3667     }
3669     /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */
3670     for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
3671     {
3672         size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);
3674         testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
3675                        "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);
3677         testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
3678                        fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
3679     }
3681     /* Inorder sampling tests using reservoir sampling via heap (compatibility mode). */
3682     testTsvSample(["test-ar10", "--compatibility-mode", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
3683     testTsvSample(["test-ar11", "--compatibility-mode", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
3684     testTsvSample(["test-ar12", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
3685     testTsvSample(["test-ar13", "--compatibility-mode", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0);
3686     testTsvSample(["test-ar14", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1);
3687     testTsvSample(["test-ar15", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1);
3688     testTsvSample(["test-ar16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
3689     testTsvSample(["test-ar17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
3690     testTsvSample(["test-ar18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum5Inorder);
3691     testTsvSample(["test-ar19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", fpath_data3x6],         data3x6ExpectedSampleCompatNum4Inorder);
3692     testTsvSample(["test-ar20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum3Inorder);
3693     testTsvSample(["test-ar21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum2Inorder);
3694     testTsvSample(["test-ar22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum1Inorder);
3696     testTsvSample(["test-as10", "--compatibility-mode", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
3697     testTsvSample(["test-as11", "--compatibility-mode", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
3698     testTsvSample(["test-as14", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3699     testTsvSample(["test-as15", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3700     testTsvSample(["test-as16", "--compatibility-mode", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
3701     testTsvSample(["test-as17", "--compatibility-mode", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
3702     testTsvSample(["test-as18", "--compatibility-mode", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5Inorder[1 .. $]);
3703     testTsvSample(["test-as19", "--compatibility-mode", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4Inorder[1 .. $]);
3704     testTsvSample(["test-as20", "--compatibility-mode", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3Inorder[1 .. $]);
3705     testTsvSample(["test-as21", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2Inorder[1 .. $]);
3706     testTsvSample(["test-as22", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1Inorder[1 .. $]);
3708     /* Inorder sampling tests with random number printing. --compatibility-mode not needed. */
3709     testTsvSample(["test-at16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
3710     testTsvSample(["test-at17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
3711     testTsvSample(["test-at18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum5ProbsInorder);
3712     testTsvSample(["test-at19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
3713     testTsvSample(["test-at19",                         "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
3714     testTsvSample(["test-at20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
3715     testTsvSample(["test-at20",                         "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
3716     testTsvSample(["test-at21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum2ProbsInorder);
3717     testTsvSample(["test-at22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum1ProbsInorder);
3719     testTsvSample(["test-au16", "--compatibility-mode", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
3720     testTsvSample(["test-au17", "--compatibility-mode", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
3721     testTsvSample(["test-au18", "--compatibility-mode", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5ProbsInorder[1 .. $]);
3722     testTsvSample(["test-au19", "--compatibility-mode", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
3723     testTsvSample(["test-au19",                         "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
3724     testTsvSample(["test-au20", "--compatibility-mode", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3ProbsInorder[1 .. $]);
3725     testTsvSample(["test-au21", "--compatibility-mode", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2ProbsInorder[1 .. $]);
3726     testTsvSample(["test-au22", "--compatibility-mode", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1ProbsInorder[1 .. $]);
3728     /* Inorder weighted sampling tests. */
3729     testTsvSample(["test-ax16", "-H", "-s", "-n", "7", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
3730     testTsvSample(["test-ax17", "-H", "-s", "-n", "6", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
3731     testTsvSample(["test-ax18", "-H", "-s", "-n", "5", "-i", fpath_data3x6], data3x6ExpectedWt3Num5Inorder);
3732     testTsvSample(["test-ax19", "-H", "-s", "-n", "4", "-i", fpath_data3x6], data3x6ExpectedWt3Num4Inorder);
3733     testTsvSample(["test-ax20", "-H", "-s", "-n", "3", "-i", fpath_data3x6], data3x6ExpectedWt3Num3Inorder);
3734     testTsvSample(["test-ax21", "-H", "-s", "-n", "2", "-i", fpath_data3x6], data3x6ExpectedWt3Num2Inorder);
3735     testTsvSample(["test-ax22", "-H", "-s", "-n", "1", "-i", fpath_data3x6], data3x6ExpectedWt3Num1Inorder);
3737     testTsvSample(["test-ay16", "-s", "-n", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
3738     testTsvSample(["test-ay17", "-s", "-n", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
3739     testTsvSample(["test-ay18", "-s", "-n", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num5Inorder[1 .. $]);
3740     testTsvSample(["test-ay19", "-s", "-n", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num4Inorder[1 .. $]);
3741     testTsvSample(["test-ay20", "-s", "-n", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num3Inorder[1 .. $]);
3742     testTsvSample(["test-ay21", "-s", "-n", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num2Inorder[1 .. $]);
3743     testTsvSample(["test-ay22", "-s", "-n", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num1Inorder[1 .. $]);
3745     /*
3746      * Distinct sampling tests.
3747      */
3748     testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
3749                   data5x25ExpectedDistinctK2P40);
3751     testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
3752                   data5x25ExpectedDistinctK2K4P20);
3754     testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
3755                   data5x25ExpectedDistinctK2K3K4P20);
3757     testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
3758                   data5x25ExpectedDistinctK2P40[1 .. $]);
3760     testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
3761                   data5x25ExpectedDistinctK2K4P20[1 .. $]);
3763     testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
3764                   data5x25ExpectedDistinctK2K3K4P20[1 .. $]);
3767     /* These distinct tests check that the whole line as '-k 0' and specifying all fields
3768      * in order have the same result. Also that field numbers don't matter, as '-k 1,2'
3769      * in data2x25 are the same keys as '-k 2,4' in data5x25.
3770      */
3771     testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25],
3772                   data2x25ExpectedDistinctK1K2P20);
3774     testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25],
3775                   data2x25ExpectedDistinctK1K2P20);
3777     testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader],
3778                   data2x25ExpectedDistinctK1K2P20[1 .. $]);
3780     testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader],
3781                   data2x25ExpectedDistinctK1K2P20[1 .. $]);
3783     /* Similar to the last set, but for a 1-column file. Also with random value printing. */
3784     testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25],
3785                   data1x25ExpectedDistinctK1P20);
3787     testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25],
3788                   data1x25ExpectedDistinctK1P20);
3790     testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader],
3791                   data1x25ExpectedDistinctK1P20[1 .. $]);
3793     testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader],
3794                   data1x25ExpectedDistinctK1P20[1 .. $]);
3797     testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25],
3798                   data1x25ExpectedDistinctK1P20Probs);
3800     testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25],
3801                   data1x25ExpectedDistinctK1P20Probs);
3803     testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader],
3804                   data1x25ExpectedDistinctK1P20Probs[1 .. $]);
3806     testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader],
3807                   data1x25ExpectedDistinctK1P20Probs[1 .. $]);
3810     testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25],
3811                   data1x25ExpectedDistinctK1P20ProbsInorder);
3813     testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25],
3814                   data1x25ExpectedDistinctK1P20ProbsInorder);
3816     testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader],
3817                   data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);
3819     testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader],
3820                   data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);
3822 }