1 /**
2 Command line tool for shuffling or sampling lines from input streams. Several methods
3 are available, including weighted and unweighted shuffling, simple and weighted random
4 sampling, sampling with replacement, Bernoulli sampling, and distinct sampling.
5 
6 Copyright (c) 2017-2021, eBay Inc.
7 Initially written by Jon Degenhardt
8 
9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 module tsv_utils.tsv_sample;
12 
13 import std.array : appender, Appender, RefAppender;
14 import std.exception : enforce;
15 import std.format : format;
16 import std.range;
17 import std.stdio;
18 import std.typecons : tuple, Flag;
19 
/* Runtime options embedded in the executable. "gcopt=cleanup:none" configures the
 * druntime garbage collector to leave live objects alone at program termination
 * rather than performing cleanup work on exit. The declaration is only compiled
 * when the compiler front-end version (__VERSION__) is 2085 or later.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
21 
22 version(unittest)
23 {
24     // When running unit tests, use main from -main compiler switch.
25 }
26 else
27 {
28     /** Main program.
29      *
30      * Invokes command line argument processing and calls tsvSample to do the real
31      * work. Errors occurring during processing are caught and reported to the user.
32      */
33     int main(string[] cmdArgs)
34     {
35         import tsv_utils.common.utils : BufferedOutputRange, LineBuffered;
36 
37         /* When running in DMD code coverage mode, turn on report merging. */
38         version(D_Coverage) version(DigitalMars)
39         {
40             import core.runtime : dmd_coverSetMerge;
41             dmd_coverSetMerge(true);
42         }
43 
44         TsvSampleOptions cmdopt;
45         const r = cmdopt.processArgs(cmdArgs);
46         if (!r[0]) return r[1];
47         version(LDC_Profile)
48         {
49             import ldc.profile : resetAll;
50             resetAll();
51         }
52 
53         immutable LineBuffered linebuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;
54 
55         try tsvSample(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout, linebuffered));
56         catch (Exception exc)
57         {
58             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
59             return 1;
60         }
61         return 0;
62     }
63 }
64 
/** Standard help text. TsvSampleOptions.processArgs passes it to
 * defaultGetoptPrinter, together with the getopt option list, when help is
 * requested.
 */
immutable helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.

Fields are specified using field number or field name. Field names require
that the input file has a header line.

Use '--help-verbose' for detailed information.

Options:
EOS";
99 
/** Detailed help text. TsvSampleOptions.processArgs passes it to
 * defaultGetoptPrinter, together with the getopt option list, when
 * '--help-verbose' is given.
 *
 * Option names quoted in this text must match the getopt definitions in
 * processArgs. In particular, '--print-random' has no single-letter form; 'p' is
 * the short form of '--p|prob'.
 */
immutable helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.

Fields: Fields are specified by field number or name. Field names require
the input file to have a header line. Use '--help-fields' for details.

Sample size: The '--n|num' option controls the sample size for all
sampling methods. In the case of simple and weighted random sampling it
also limits the amount of memory required.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, there is no memory accumulation. These algorithms
can run on arbitrary size inputs. Sampling with replacement reads all
lines into memory and is limited by available memory. Shuffling also reads
all lines into memory and is similarly limited. Random sampling uses
reservoir sampling, and only needs to hold the sample size (--n|num) in
memory. The input data can be of any length.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";
198 
/** Container for command line options and derived data.
 *
 * TsvSampleOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, TsvSampleOptions is used as a container
 * holding the specific processing options used by the different sampling routines.
 */
struct TsvSampleOptions
{
    import tsv_utils.common.utils : InputSourceRange;

    string programName;                        /// Program name
    InputSourceRange inputSources;             /// Input files
    bool hasHeader = false;                    /// --H|header
    ulong sampleSize = 0;                      /// --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
    size_t[] keyFields;                        /// Derived: --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    /// Derived: --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           /// --r|replace
    bool preserveInputOrder = false;           /// --i|inorder
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    bool printRandom = false;                  /// --print-random
    bool genRandomInorder = false;             /// --gen-random-inorder
    string randomValueHeader = "random_value"; /// --random-value-header
    bool compatibilityMode = false;            /// --compatibility-mode
    char delim = '\t';                         /// --d|delimiter
    bool lineBuffered = false;                 /// --line-buffered
    bool preferSkipSampling = false;           /// --prefer-skip-sampling
    bool preferAlgorithmR = false;             /// --prefer-algorithm-r
    bool hasWeightField = false;               /// Derived.
    bool useBernoulliSampling = false;         /// Derived.
    bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
    bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
    uint seed = 0;                             /// Derived from --static-seed, --seed-value

    /** Process tsv-sample command line arguments.
     *
     * Defines the command line options, performs validation, and derives additional
     * state. std.getopt.getopt is called to do the main option processing, followed
     * by additional validation and derivation.
     *
     * Help text is printed to standard output if help was requested. Error text is
     * written to stderr if invalid input is encountered.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first. Options are consumed
     *             by getopt; the remaining arguments are taken as input file paths
     *             and the array is truncated to length one.
     *
     * Returns: A tuple. First value is true if command line arguments were
     * successfully processed and execution should continue, or false if an error
     * occurred or the user asked for help. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. Field indices will have been converted to zero-based.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : all, canFind, each;
        import std.conv : to;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : inputSourceRange, ReadHeader, throwIfWindowsNewline;
        import tsv_utils.common.fieldlist;

        bool helpVerbose = false;                  // --help-verbose
        bool helpFields = false;                   // --help-fields
        bool versionWanted = false;                // --V|version
        string keyFieldsArg;                       // --k|key-fields
        string weightFieldArg;                     // --w|weight-field

        /* Option strings are kept in variables so the same text is used for the
         * getopt definition and for field-list error messages.
         */
        string keyFieldsOptionString = "k|key-fields";
        string weightFieldOptionString = "w|weight-field";

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,
                "help-fields",     "     Print help on specifying fields.", &helpFields,

                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                keyFieldsOptionString,
                "<field-list>  Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
                &keyFieldsArg,

                weightFieldOptionString,
                "NUM  Field containing weights. All lines get equal weight if not provided.",
                &weightFieldArg,

                "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "i|inorder",       "     Output random samples in original input order. Requires use of --n|num.", &preserveInputOrder,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header",  "     Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
                "compatibility-mode", "     Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,

                "d|delimiter",     "CHR  Field delimiter.", &delim,
                "line-buffered",   "     Immediately output every sampled line. Applies to Bernoulli and distinct sampling. Ignored in modes where all input data must be read before generating output.", &lineBuffered,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "prefer-skip-sampling", "     (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
                &preferSkipSampling,

                "prefer-algorithm-r",   "     (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
                &preferAlgorithmR,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Input files. Remaining command line args are files. Standard input
             * ("-") is used when no files are listed.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             *
             * Note: keyFields and weightField depend on header line processing, but
             * keyFieldsArg and weightFieldArg can be used to detect whether the
             * command line argument was specified.
             */

            /* Set hasWeightField here so it can be used in other validation checks.
             * Field validity checked after reading file header.
             */
            hasWeightField = !weightFieldArg.empty;

            /* Sampling with replacement checks (--r|replace). */
            if (srsWithReplacement)
            {
                enforce(!hasWeightField,
                        "Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");

                enforce(inclusionProbability.isNaN,
                        "Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");

                enforce(keyFieldsArg.empty,
                        "Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");

                enforce(!printRandom && !genRandomInorder,
                        "Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");

                enforce(!preserveInputOrder,
                        "Sampling with replacement (--r|replace) does not support input order preservation (--i|inorder option).");
            }

            /* Distinct sampling checks (--k|key-fields --p|prob). Key fields are
             * only valid when an inclusion probability was also provided.
             */
            enforce(keyFieldsArg.empty || !inclusionProbability.isNaN,
                    "--p|prob is required when using --k|key-fields.");

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling
             * and distinct sampling.
             */
            if (!inclusionProbability.isNaN)
            {
                enforce(inclusionProbability > 0.0 && inclusionProbability <= 1.0,
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));

                if (!keyFieldsArg.empty) useDistinctSampling = true;
                else useBernoulliSampling = true;

                enforce(!hasWeightField, "--w|weight-field and --p|prob cannot be used together.");

                enforce(!genRandomInorder || useDistinctSampling,
                        "--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used." ~
                        "\nUse --gen-random-inorder alone to print probabilities for all lines." ~
                        "\nUse --p|prob and --print-random to print probabilities for lines satisfying the probability threshold.");
            }
            else if (genRandomInorder && !hasWeightField)
            {
                /* Unweighted --gen-random-inorder is handled by the Bernoulli
                 * sampling routines.
                 */
                useBernoulliSampling = true;
            }

            /* randomValueHeader (--random-value-header) validity. Note that
               randomValueHeader is initialized to a valid, non-empty string.
            */
            enforce(!randomValueHeader.empty && !randomValueHeader.canFind('\n') &&
                    !randomValueHeader.canFind(delim),
                    "--random-value-header must be at least one character and not contain field delimiters or newlines.");

            /* Check for incompatible use of (--i|inorder) and shuffling of the full
             * data set. Sampling with replacement is also incompatible, this is
             * detected earlier. Shuffling is the default operation, so it is
             * identified by eliminating the other modes of operation.
             */
            enforce(!preserveInputOrder ||
                    sampleSize != 0 ||
                    useBernoulliSampling ||
                    useDistinctSampling,
                    "Preserving input order (--i|inorder) is not compatible with full data set shuffling. Switch to random sampling with a sample size (--n|num) to use --i|inorder.");

            /* Compatibility mode checks:
             * - Random value printing implies compatibility-mode, otherwise user's
             *   selection is used.
             * - Distinct sampling doesn't support compatibility-mode. The routines
             *   don't care, but users might expect larger probabilities to be a
             *   superset of smaller probabilities. This would be confusing, so
             *   flag it as an error.
             *
             * The distinct sampling check is done before random value printing turns
             * compatibility mode on, so only an explicit --compatibility-mode
             * triggers the error.
             */
            enforce(!(compatibilityMode && useDistinctSampling),
                    "Distinct sampling (--k|key-fields --p|prob) does not support --compatibility-mode.");

            if (printRandom || genRandomInorder) compatibilityMode = true;

            /* Ignore --line-buffered if not using Bernoulli or distinct sampling. */
            if (!useBernoulliSampling && !useDistinctSampling) lineBuffered = false;

            /* Seed. An explicit non-zero --v|seed-value takes precedence over the
             * fixed seed used by --s|static-seed.
             */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            string[] headerFields;

            /* fieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             */
            void fieldListArgProcessing()
            {
                if (!weightFieldArg.empty)
                {
                    auto fieldIndices =
                        weightFieldArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero)
                        (hasHeader, headerFields, weightFieldOptionString)
                        .array;

                    enforce(fieldIndices.length == 1,
                            format("'--%s' must be a single field.", weightFieldOptionString));

                    weightField = fieldIndices[0];
                }

                if (!keyFieldsArg.empty)
                {
                    /* Key fields are parsed one-based with zero allowed, so that
                     * '--k|key-fields 0' (whole line as key) can be detected. The
                     * conversion to zero-based indices is done below.
                     */
                    keyFields =
                        keyFieldsArg
                        .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
                        (hasHeader, headerFields, keyFieldsOptionString)
                        .array;

                    assert(keyFields.length > 0);

                    if (keyFields.length > 0)
                    {
                        if (keyFields.length == 1 && keyFields[0] == 0)
                        {
                            distinctKeyIsFullLine = true;
                        }
                        else
                        {
                            enforce(keyFields.length <= 1 || keyFields.all!(x => x != 0),
                                    "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");

                            keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
                        }
                    }
                }
            }

            if (!hasHeader) fieldListArgProcessing();

            /*
             * Create the inputSourceRange and perform header line processing.
             */
            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
            inputSources = inputSourceRange(filepaths, readHeader);

            if (hasHeader)
            {
                throwIfWindowsNewline(inputSources.front.header, inputSources.front.name, 1);
                headerFields = inputSources.front.header.split(delim).to!(string[]);
                fieldListArgProcessing();
            }

        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
/** Top-level dispatcher: runs the sampling routine matching the command line options.
 *
 * The modes are tested in a fixed priority order: sampling with replacement,
 * Bernoulli sampling, distinct sampling, weighted random value generation in input
 * order, random sampling with a sample size, and finally shuffling. The first
 * matching mode is run and no others.
 */
void tsvSample(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.srsWithReplacement)
    {
        simpleRandomSamplingWithReplacement(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useBernoulliSampling)
    {
        bernoulliSamplingCommand(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useDistinctSampling)
    {
        if (!cmdopt.genRandomInorder) distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
        else distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        return;
    }

    if (cmdopt.genRandomInorder)
    {
        /* The modes checked above either deal with gen-random-inorder themselves
         * (Bernoulli, distinct) or don't support it (sampling with replacement).
         * Reaching this point is expected only for the weighted case.
         */
        assert(cmdopt.hasWeightField);
        generateWeightedRandomValuesInorder(cmdopt, outputStream);
        return;
    }

    /* No sample size means the full data set is shuffled. */
    if (cmdopt.sampleSize == 0) shuffleCommand(cmdopt, outputStream);
    else randomSamplingCommand(cmdopt, outputStream);
}
568 
/** Command handler for Bernoulli sampling. Picks the Bernoulli sampling routine and
 * template instantiation appropriate for the command line arguments.
 *
 * The main decision is between the per-line algorithm (bernoulliSampling) and skip
 * sampling (bernoulliSkipSampling). Skip sampling is a little bit faster when the
 * inclusion probability is small, but doesn't support compatibility mode. See the
 * bernoulliSkipSampling documentation for a discussion of the
 * skipSamplingProbabilityThreshold used here.
 */
void bernoulliSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(!cmdopt.hasWeightField);

    immutable double skipSamplingProbabilityThreshold = 0.04;

    /* Written as a '>' test on purpose: a NaN probability compares false. */
    immutable bool aboveThreshold = cmdopt.inclusionProbability > skipSamplingProbabilityThreshold;
    immutable bool usePerLineAlgorithm =
        cmdopt.compatibilityMode || (aboveThreshold && !cmdopt.preferSkipSampling);

    if (!usePerLineAlgorithm)
    {
        bernoulliSkipSampling(cmdopt, outputStream);
        return;
    }

    if (cmdopt.genRandomInorder) bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
    else bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
}
604 
605 /** Bernoulli sampling of lines from the input stream.
606  *
 * Each input line is assigned a random value and output if less than
608  * cmdopt.inclusionProbability. The order of the lines is not changed.
609  *
610  * This routine supports random value printing and gen-random-inorder value printing.
611  */
612 void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
613     (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
614 if (isOutputRange!(OutputRange, char))
615 {
616     import std.random : Random = Mt19937, uniform01;
617     import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
618         InputSourceRange, LineBuffered, throwIfWindowsNewline;
619 
620     static if (generateRandomAll) assert(cmdopt.genRandomInorder);
621     else assert(!cmdopt.genRandomInorder);
622 
623     assert(!cmdopt.inputSources.empty);
624     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
625 
626     auto randomGenerator = Random(cmdopt.seed);
627 
628     /* First header is read during command line argument processing. */
629     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
630     {
631         auto inputStream = cmdopt.inputSources.front;
632 
633         static if (generateRandomAll)
634         {
635             outputStream.put(cmdopt.randomValueHeader);
636             outputStream.put(cmdopt.delim);
637         }
638         else if (cmdopt.printRandom)
639         {
640             outputStream.put(cmdopt.randomValueHeader);
641             outputStream.put(cmdopt.delim);
642         }
643 
644         outputStream.put(inputStream.header);
645         outputStream.put("\n");
646 
647         /* Immediately flush the header so subsequent processes in a unix command
648          * pipeline see it early. This helps provide timely error messages.
649          */
650         static if (isFlushableOutputRange!OutputRange) outputStream.flush;
651     }
652 
653     /* Process each line. */
654     immutable LineBuffered isLineBuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;
655     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
656     ulong numLinesWritten = 0;
657 
658     foreach (inputStream; cmdopt.inputSources)
659     {
660         if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);
661 
662         foreach (ulong fileLineNum, line;
663                  inputStream
664                  .file
665                  .bufferedByLine!(KeepTerminator.no)(isLineBuffered)
666                  .enumerate(fileBodyStartLine))
667         {
668             if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum);
669 
670             immutable double lineScore = uniform01(randomGenerator);
671 
672             static if (generateRandomAll)
673             {
674                 outputStream.formatRandomValue(lineScore);
675                 outputStream.put(cmdopt.delim);
676                 outputStream.put(line);
677                 outputStream.put("\n");
678 
679                 if (cmdopt.sampleSize != 0)
680                 {
681                     ++numLinesWritten;
682                     if (numLinesWritten == cmdopt.sampleSize) return;
683                 }
684             }
685             else if (lineScore < cmdopt.inclusionProbability)
686             {
687                 if (cmdopt.printRandom)
688                 {
689                     outputStream.formatRandomValue(lineScore);
690                     outputStream.put(cmdopt.delim);
691                 }
692                 outputStream.put(line);
693                 outputStream.put("\n");
694 
695                 if (cmdopt.sampleSize != 0)
696                 {
697                     ++numLinesWritten;
698                     if (numLinesWritten == cmdopt.sampleSize) return;
699                 }
700             }
701         }
702     }
703 }
704 
/** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips.
 *
 * Skip sampling works by skipping a random number of lines between selections. This
 * can be faster than assigning a random value to each line when the inclusion
 * probability is low, as it reduces the number of calls to the random number
 * generator. Both the random number generator and the log() function are called when
 * calculating the next skip size. These additional log() calls add up as the
 * inclusion probability increases.
 *
 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
 * file-oriented line sampling. This is obviously environment specific. In the
 * environments this implementation has been tested in the performance improvements
 * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
 *
 * The algorithm does not assign random values to individual lines. This makes it
 * incompatible with random value printing. It is not suitable for compatibility mode
 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
 * does not have this property.
 *
 * The algorithm for calculating the skip size has been described by multiple sources.
 * There are two key variants depending on whether the total number of lines in the
 * data set is known in advance. (This implementation does not know the total.)
 * Useful references:
 * $(LIST
 *     * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
 *       ACM Trans on Mathematical Software, 1987. On-line:
 *       http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
 *     * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
 *       "Data Stream Management", Springer-Verlag, 2016. On-line:
 *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
 *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
 *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
 * )
 *
 * The output range is taken by 'auto ref', like the other sampling routines, so that
 * lvalue output ranges are written to directly rather than through a copy.
 *
 * A non-zero cmdopt.sampleSize caps the number of lines written.
 */
void bernoulliSkipSampling(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.conv : to;
    import std.math : log, trunc;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputSourceRange, LineBuffered, throwIfWindowsNewline;

    assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.compatibilityMode);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
    immutable double logDiscardRate = log(discardRate);

    /* Draws the number of lines to skip before the next selected line.
     *
     * Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed
     * interval to (0.0, 1.0], excluding 0.0.
     */
    size_t nextSkipCount()
    {
        return (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
    }

    size_t remainingSkips = nextSkipCount();

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;

        outputStream.put(inputStream.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line. */
    immutable LineBuffered isLineBuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong numLinesWritten = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream
                 .file
                 .bufferedByLine!(KeepTerminator.no)(isLineBuffered)
                 .enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum);

            if (remainingSkips > 0)
            {
                --remainingSkips;
            }
            else
            {
                outputStream.put(line);
                outputStream.put("\n");

                /* A sample size of zero means no limit. */
                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }

                /* The skip count persists across input files. */
                remainingSkips = nextSkipCount();
            }
        }
    }
}
816 
/** Sample lines by choosing a random set of distinct keys formed from one or more
 * fields on each line.
 *
 * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling.
 * Rather than running an independent trial per line, the decision is made per key:
 * a portion of the keys is selected and every line carrying a selected key is
 * written.
 *
 * An example use-case is a query log having <user, query, clicked-url> triples. It is
 * often useful to sample the records for a portion of the users while keeping all
 * records of each selected user. Using the user field as key achieves this.
 *
 * The key is hashed (MurmurHash3, seeded with cmdopt.seed) and the hash value is
 * mapped into buckets. The number of buckets is the inverse of the inclusion
 * probability rounded to an integer; lines whose key maps to bucket zero are written.
 * Buckets are equal size and therefore may be larger than the inclusion probability.
 * (The alternative would be to have the caller specify the number of buckets. More
 * correct, but less convenient.)
 *
 * With Yes.generateRandomAll every line is written, prefixed by its bucket number.
 * With No.generateRandomAll and cmdopt.printRandom set, selected lines are prefixed
 * by '0'. A non-zero cmdopt.sampleSize caps the number of lines written.
 */
void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputFieldReordering, InputSourceRange, LineBuffered, throwIfWindowsNewline;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    static if (generateRandomAll)
    {
        import std.format : formatValue, singleSpec;
        immutable bucketFormatSpec = singleSpec("%d");
    }

    /* Separator fed to the hasher between the fields of a multi-field key. */
    immutable ubyte[1] keySeparator = [cmdopt.delim];

    immutable double inverseProbability = 1.0 / cmdopt.inclusionProbability;
    immutable uint bucketCount = inverseProbability.lrint.to!uint;

    /* Key field extraction is only needed when the key is not the whole line. */
    auto keyExtractor = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);

    /* The bucket column is always present in gen-random-inorder mode, otherwise only
     * when random value printing was requested.
     */
    immutable bool prefixBucketColumn = generateRandomAll || cmdopt.printRandom;

    /* The header of the first input was already read during argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstInput = cmdopt.inputSources.front;

        if (prefixBucketColumn)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }

        outputStream.put(firstInput.header);
        outputStream.put("\n");

        /* Flush right away so downstream processes in a unix pipeline see the header
         * early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Walk the data lines of every input. */
    immutable LineBuffered readMode = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;
    immutable size_t firstBodyLineNum = cmdopt.hasHeader ? 2 : 1;
    ulong linesEmitted = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);

        auto numberedLines =
            inputStream
            .file
            .bufferedByLine!(KeepTerminator.no)(readMode)
            .enumerate(firstBodyLineNum);

        foreach (ulong fileLineNum, line; numberedLines)
        {
            if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum);

            /* A fresh hasher per line: key parts are added one by one, then the hash
             * is finalized.
             */
            auto keyHasher = MurmurHash3!32(cmdopt.seed);

            if (!cmdopt.distinctKeyIsFullLine)
            {
                assert(keyExtractor !is null);

                /* Collect the key fields, stopping as soon as all have been seen. */
                keyExtractor.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    keyExtractor.processNextField(fieldIndex, fieldValue);
                    if (keyExtractor.allFieldsFilled) break;
                }

                enforce(keyExtractor.allFieldsFilled,
                        format("Not enough fields in line. File: %s, Line: %s",
                               inputStream.name, fileLineNum));

                /* Hash the key fields with a delimiter between consecutive fields. */
                foreach (position, keyPart; keyExtractor.outputFields.enumerate)
                {
                    if (position > 0) keyHasher.put(keySeparator);
                    keyHasher.put(cast(ubyte[]) keyPart);
                }
            }
            else
            {
                keyHasher.put(cast(ubyte[]) line);
            }

            keyHasher.finish;

            immutable uint bucket = keyHasher.get % bucketCount;

            static if (generateRandomAll)
            {
                outputStream.formatValue(bucket, bucketFormatSpec);
                outputStream.put(cmdopt.delim);
            }
            else
            {
                /* Only keys landing in bucket zero are part of the sample. */
                if (bucket != 0) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put('0');
                    outputStream.put(cmdopt.delim);
                }
            }

            outputStream.put(line);
            outputStream.put("\n");

            /* A sample size of zero means no limit; the counter is then left alone. */
            if (cmdopt.sampleSize != 0 && ++linesEmitted == cmdopt.sampleSize) return;
        }
    }
}
979 
/** Random sampling command handler. Invokes the appropriate sampling routine based on
 * the command line arguments.
 *
 * Random sampling selects a fixed size random sample from the input stream. Both
 * simple random sampling (equal likelihood) and weighted random sampling are
 * supported. Selected lines are output either in random order or original input order.
 * For weighted sampling the random order is the weighted selection order.
 *
 * Two algorithms are available: reservoir sampling via a heap and reservoir sampling
 * via Algorithm R. This routine picks the reservoir sampling function and template
 * instantiation to use based on the command line arguments.
 *
 * Weighted sampling always uses the heap approach. Compatibility mode does as well,
 * as it is the method that uses per-line random value assignments. The implication
 * of compatibility mode is that a larger sample size includes all the results from
 * a smaller sample, assuming the same random seed is used.
 *
 * For unweighted sampling there is a performance tradeoff between implementations.
 * Heap-based sampling is faster for small sample sizes. Algorithm R is faster for
 * large sample sizes. The threshold used was chosen based on performance tests. See
 * the reservoirSamplingAlgorithmR documentation for more information.
 */
void randomSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(cmdopt.sampleSize != 0);

    /* Sample sizes below this value default to the heap-based algorithm. */
    immutable size_t algorithmRSampleSizeThreshold = 128 * 1024;

    /* Weighted sampling: always heap-based. */
    if (cmdopt.hasWeightField)
    {
        if (cmdopt.preserveInputOrder)
            reservoirSamplingViaHeap!(Yes.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream);
        else
            reservoirSamplingViaHeap!(Yes.isWeighted, No.preserveInputOrder)(cmdopt, outputStream);
        return;
    }

    /* Unweighted sampling: the heap is required in compatibility mode, and is the
     * default for small samples unless Algorithm R was explicitly preferred.
     */
    immutable bool belowThreshold = cmdopt.sampleSize < algorithmRSampleSizeThreshold;
    immutable bool useHeap =
        cmdopt.compatibilityMode || (belowThreshold && !cmdopt.preferAlgorithmR);

    if (useHeap)
    {
        if (cmdopt.preserveInputOrder)
            reservoirSamplingViaHeap!(No.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream);
        else
            reservoirSamplingViaHeap!(No.isWeighted, No.preserveInputOrder)(cmdopt, outputStream);
    }
    else
    {
        if (cmdopt.preserveInputOrder)
            reservoirSamplingAlgorithmR!(Yes.preserveInputOrder)(cmdopt, outputStream);
        else
            reservoirSamplingAlgorithmR!(No.preserveInputOrder)(cmdopt, outputStream);
    }
}
1042 
/** Reservoir sampling using a heap. Both weighted and unweighted random sampling are
 * supported.
 *
 * The algorithm used here is based on the one-pass algorithm described by Pavlos
 * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are
 * simply set to one.
 *
 * The implementation uses a heap (priority queue) large enough to hold the desired
 * number of lines. Input is read line-by-line, assigned a random value, and added to
 * the heap. The role of the heap is to identify the lines with the highest assigned
 * random values. Once the heap is full, adding a new line means dropping the line with
 * the lowest score. A "min" heap is used for this reason.
 *
 * When done reading all lines, the "min" heap is in reverse of weighted selection
 * order. Weighted selection order is obtained by removing each element one at a time
 * from the heap. The underlying data store will have the elements in weighted selection
 * order (largest weights first).
 *
 * Generating output in weighted order is useful for several reasons:
 *  - For weighted sampling, it preserves the property that smaller valid subsets can be
 *    created by taking the first N lines.
 *  - For unweighted sampling, it ensures that all output permutations are possible, and
 *    are not influenced by input order or the heap data structure used.
 *  - Order consistency is maintained when making repeated use of the same random seed,
 *    but with different sample sizes.
 *
 * The other choice is preserving input order. This is supported by recording line
 * numbers and sorting the selected sample.
 *
 * There are use cases where only the selection set matters. For these some performance
 * could be gained by skipping the reordering and simply printing the backing store
 * array in-order. Performance tests indicate only a minor benefit, so this is not
 * supported.
 *
 * Template parameters: isWeighted selects weighted scoring using cmdopt.weightField;
 * preserveInputOrder selects output in input order rather than selection order.
 *
 * Notes:
 * $(LIST
 *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
 *      randomization of all input lines. This was dropped in version 1.2.2 in favor
 *      of the approach used in randomizeLines. The latter has significant advantages
 *      given that all data must be read into memory.
 *    * For large reservoir sizes better performance can be achieved using Algorithm R.
 *      See the reservoirSamplingAlgorithmR documentation for details.
 * )
 */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : sort;
    import std.container.array;
    import std.container.binaryheap;
    import std.meta : AliasSeq;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputSourceRange, throwIfWindowsNewline;

    /* The template flag must agree with the runtime option. */
    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    /* Reservoir entry: the score assigned to the line, a copy of the line, and (only
     * when input order must be restored) the position of the line in the input.
     */
    static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
    {
        double score;
        const(char)[] line;
        static if (preserveInputOrder) ulong lineNumber;
    }

    /* Create the heap and backing data store.
     *
     * Note: An std.container.array is used as the backing store to avoid some issues in
     * the standard library (Phobos) binaryheap implementation. Specifically, when an
     * std.container.array is used as backing store, the heap can be efficiently reversed
     * by removing the heap elements. This leaves the backing store in the reversed order.
     * However, the current binaryheap implementation does not support this for all
     * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */

    Array!(Entry!preserveInputOrder) dataStore;
    dataStore.reserve(cmdopt.sampleSize);
    auto reservoir = dataStore.heapify!("a.score > b.score")(0);  // Min binaryheap

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;

        if (cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(inputStream.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;

    /* Running count of data lines over all input files; it is not reset per file. */
    static if (preserveInputOrder) ulong totalLineNum = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum);

            static if (!isWeighted)
            {
                immutable double lineScore = uniform01(randomGenerator);
            }
            else
            {
                /* Weighted score: random value raised to the power 1/weight. Lines whose
                 * weight is not greater than zero get score 0.0 and consume no random
                 * value.
                 */
                immutable double lineWeight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum);
                immutable double lineScore =
                    (lineWeight > 0.0)
                    ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                    : 0.0;
            }

            /* Extra constructor argument for Entry: the line number when input order is
             * preserved, nothing otherwise.
             */
            static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
            else alias entryCTArgs = AliasSeq!();

            /* Fill the reservoir first. Once full, a line only enters by displacing the
             * current minimum, and only if its score is strictly larger.
             * NOTE(review): line.dup presumably guards against bufferedByLine reusing its
             * buffer - confirm against bufferedByLine.
             */
            if (reservoir.length < cmdopt.sampleSize)
            {
                reservoir.insert(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs));
            }
            else if (reservoir.front.score < lineScore)
            {
                reservoir.replaceFront(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs));
            }

            static if (preserveInputOrder) ++totalLineNum;
        }
    }

    /* Done with input, all entries are in the reservoir. */

    /* The asserts here avoid issues with the current binaryheap implementation. They
     * detect use of backing stores having a length not synchronized to the reservoir.
     */
    immutable ulong numLines = reservoir.length;
    assert(numLines == dataStore.length);

    /* Update the backing store so it is in the desired output order.
     */
    static if (preserveInputOrder)
    {
        /* Sort the backing store directly by recorded input position. */
        dataStore[].sort!((a, b) => a.lineNumber < b.lineNumber);
    }
    else
    {
        /* Output in weighted selection order. The heap is in reverse order of assigned
         * weights. Reversing order is done by removing all elements from the heap. This
         * leaves the backing store in the correct order.
         */
        while (!reservoir.empty) reservoir.removeFront;
    }

    assert(numLines == dataStore.length);

    /* Write the sample from the backing store, which is now in output order. */
    foreach (entry; dataStore)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatRandomValue(entry.score);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.line);
        outputStream.put("\n");
    }
 }
1229 
/** Generate weighted random values for all input lines, preserving input order.
 *
 * This complements weighted reservoir sampling, but instead of using a reservoir it
 * simply iterates over the input lines generating the values. The weighted random
 * values are generated with the same formula used by reservoirSamplingViaHeap: a
 * uniform random value raised to the power (1 / weight), or 0.0 when the weight is
 * not greater than zero.
 *
 * Every line is written, prefixed by its weighted random value. A non-zero
 * cmdopt.sampleSize caps the number of lines written.
 *
 * This is a streaming routine, so input is read in line buffered mode when
 * cmdopt.lineBuffered is set, the same as the other streaming samplers.
 */
void generateWeightedRandomValuesInorder(OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputSourceRange, LineBuffered, throwIfWindowsNewline;

    assert(cmdopt.hasWeightField);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;

        outputStream.put(cmdopt.randomValueHeader);
        outputStream.put(cmdopt.delim);
        outputStream.put(inputStream.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line. */
    immutable LineBuffered isLineBuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong numLinesWritten = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream
                 .file
                 .bufferedByLine!(KeepTerminator.no)(isLineBuffered)
                 .enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum);

            immutable double lineWeight =
                getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum);

            /* Lines whose weight is not greater than zero score 0.0 and consume no
             * random value.
             */
            immutable double lineScore =
                (lineWeight > 0.0)
                ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                : 0.0;

            outputStream.formatRandomValue(lineScore);
            outputStream.put(cmdopt.delim);
            outputStream.put(line);
            outputStream.put("\n");

            /* A sample size of zero means no limit. */
            if (cmdopt.sampleSize != 0)
            {
                ++numLinesWritten;
                if (numLinesWritten == cmdopt.sampleSize) return;
            }
        }
    }
}
1301 
1302 /** Reservoir sampling via Algorithm R
1303  *
1304  * This is an implementation of reservoir sampling using what is commonly known as
 * "Algorithm R", credited to Alan Waterman by Donald Knuth in "The Art of
1306  * Computer Programming, Volume 2: Seminumerical Algorithms". More information about
1307  * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with
1308  * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
1309  * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
1310  *
1311  * Algorithm R is used for unweighted sampling without replacement. The heap-based
1312  * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
1313  *
1314  * The classic algorithm stops after identifying the selected set of items. This
1315  * implementation goes one step further and randomizes the order of the selected
1316  * lines. This is consistent with shuffling (line order randomization), a primary
1317  * tsv-sample use-case.
1318  *
1319  * This algorithm is faster than reservoirSamplingViaHeap when the sample size
1320  * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
1321  * Insertion in this algorithm is O(1). Similarly, generating the random order in the
1322  * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
1323  *
1324  * This speed advantage may be offset a certain amount by using a more expensive random
1325  * value generator. reservoirSamplingViaHeap generates values between zero and one,
1326  * whereas reservoirSamplingAlgorithmR generates random integers over and ever growing
1327  * interval. The latter is expected to be more expensive. This is consistent with
1328  * performance tests indicating that reservoirSamplingViaHeap is faster when using
1329  * small-to-medium size reservoirs and large input streams.
1330  */
1331 void reservoirSamplingAlgorithmR(Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
1332     (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1333 if (isOutputRange!(OutputRange, char))
1334 {
1335     import std.meta : AliasSeq;
1336     import std.random : Random = Mt19937, randomShuffle, uniform;
1337     import std.algorithm : sort;
1338     import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
1339         InputSourceRange, throwIfWindowsNewline;
1340 
1341     assert(cmdopt.sampleSize > 0);
1342     assert(!cmdopt.hasWeightField);
1343     assert(!cmdopt.compatibilityMode);
1344     assert(!cmdopt.printRandom);
1345     assert(!cmdopt.genRandomInorder);
1346 
1347     assert(!cmdopt.inputSources.empty);
1348     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
1349 
1350     static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
1351     {
1352         const(char)[] line;
1353         static if (preserveInputOrder) ulong lineNumber;
1354     }
1355 
1356     Entry!preserveInputOrder[] reservoir;
1357     auto reservoirAppender = appender(&reservoir);
1358     reservoirAppender.reserve(cmdopt.sampleSize);
1359 
1360     auto randomGenerator = Random(cmdopt.seed);
1361 
1362     /* First header is read during command line argument processing. */
1363     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
1364     {
1365         auto inputStream = cmdopt.inputSources.front;
1366 
1367         outputStream.put(inputStream.header);
1368         outputStream.put("\n");
1369 
1370         /* Immediately flush the header so subsequent processes in a unix command
1371          * pipeline see it early. This helps provide timely error messages.
1372          */
1373         static if (isFlushableOutputRange!OutputRange) outputStream.flush;
1374     }
1375 
1376     /* Process each line. */
1377     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
1378     ulong totalLineNum = 0;
1379 
1380     foreach (inputStream; cmdopt.inputSources)
1381     {
1382         if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);
1383 
1384         foreach (ulong fileLineNum, line;
1385                  inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
1386         {
1387             if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum);
1388 
1389             /* Add lines to the reservoir until the reservoir is filled.
1390              * After that lines are added with decreasing likelihood, based on
1391              * the total number of lines seen. If added to the reservoir, the
1392              * line replaces a randomly chosen existing line.
1393              */
1394             static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
1395             else alias entryCTArgs = AliasSeq!();
1396 
1397             if (totalLineNum < cmdopt.sampleSize)
1398             {
1399                 reservoirAppender ~= Entry!preserveInputOrder(line.idup, entryCTArgs);
1400             }
1401             else
1402             {
1403                 immutable size_t i = uniform(0, totalLineNum, randomGenerator);
1404                 if (i < reservoir.length)
1405                 {
1406                     reservoir[i] = Entry!preserveInputOrder(line.idup, entryCTArgs);
1407                 }
1408             }
1409 
1410             ++totalLineNum;
1411         }
1412     }
1413 
1414     /* Done with input. The sample is in the reservoir. Update the order and print. */
1415 
1416     static if (preserveInputOrder)
1417     {
1418         reservoir.sort!((a, b) => a.lineNumber < b.lineNumber);
1419     }
1420     else
1421     {
1422         reservoir.randomShuffle(randomGenerator);
1423     }
1424 
1425     foreach (ref entry; reservoir)
1426     {
1427         outputStream.put(entry.line);
1428         outputStream.put("\n");
1429     }
1430 }
1431 
/** Shuffling command handler. Picks the shuffle (line order randomization) routine
 * matching the command line arguments and runs it.
 *
 * Shuffling resembles random sampling, but different algorithms are used. Random
 * sampling selects a subset, so only the current selection has to be held in memory,
 * which is what reservoir sampling provides. Shuffling has to hold all input in
 * memory, so it works better to read every line up front and then randomize.
 *
 * Dispatch rules:
 * $(LIST
 *   * A weight field selects weighted randomizeLinesViaSort.
 *   * Otherwise compatibility mode selects unweighted randomizeLinesViaSort.
 *   * Otherwise randomizeLinesViaShuffle (array shuffling) is used.
 * )
 *
 * All of these routines are limited by available memory.
 */
void shuffleCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    /* Weighted shuffling is handled by the sort-based routine. */
    if (cmdopt.hasWeightField)
    {
        randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    /* Unweighted. Compatibility mode also uses the sort-based routine. */
    if (cmdopt.compatibilityMode)
    {
        randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
        return;
    }

    randomizeLinesViaShuffle(cmdopt, outputStream);
}
1463 
/** Shuffle all input lines by assigning random values and sorting.
 *
 * randomizeLinesViaSort reads every input line, attaches a random value to each one,
 * sorts on that value in descending order, and writes the lines out. Both weighted
 * and unweighted shuffling are supported. The random value is written in front of
 * each line when random value printing is enabled.
 *
 * Notes:
 * $(LIST
 *   * For unweighted shuffling randomizeLinesViaShuffle is faster and should be used
 *     unless compatibility mode is needed.
 *   * This routine is significantly faster than heap-based reservoir sampling in the
 *     case where the entire file is being read.
 *   * Input data must be read entirely in memory. Disk oriented techniques are needed
 *     when data sizes get too large for available memory. One option is to generate
 *     random values for each line, e.g. --gen-random-inorder, and sort with a disk-
 *     backed sort program like GNU sort.
 * )
 */
void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : sort;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize == 0);

    /* Pull all input into memory (readFileData also writes the first header line),
     * then split it into lines, each tagged with its random value.
     */
    const inputBlocks = readFileData!(Yes.hasRandomValue)(cmdopt, outputStream);
    auto scoredLines = identifyInputLines!(Yes.hasRandomValue, isWeighted)(inputBlocks, cmdopt);

    /* Largest random value first. */
    sort!((lhs, rhs) => lhs.randomValue > rhs.randomValue)(scoredLines);

    const emitRandomValue = cmdopt.printRandom;

    foreach (scored; scoredLines)
    {
        if (emitRandomValue)
        {
            outputStream.formatRandomValue(scored.randomValue);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(scored.data);
        outputStream.put("\n");
    }
}
1516 
/** Shuffle (randomize) all input lines using an array shuffling algorithm.
 *
 * Every line from the input files and/or standard input is read into memory and then
 * written out in random order. Array shuffling is faster than sorting, which makes
 * this routine a good alternative to randomizeLinesViaSort for unweighted shuffling
 * (the most common case).
 *
 * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 *
 * This routine does not support random value printing or compatibility-mode.
 */
void randomizeLinesViaShuffle(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, randomShuffle;

    assert(cmdopt.sampleSize == 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    /* Load everything into memory and locate the individual lines. */
    const inputBlocks = readFileData!(No.hasRandomValue)(cmdopt, outputStream);
    auto lines = identifyInputLines!(No.hasRandomValue, No.isWeighted)(inputBlocks, cmdopt);

    /* Shuffle in place, then write the lines in their new order.
     *
     * Note: Also tried randomCover, but that was exceedingly slow.
     */
    auto rng = Random(cmdopt.seed);
    randomShuffle(lines, rng);

    foreach (i; 0 .. lines.length)
    {
        outputStream.put(lines[i].data);
        outputStream.put("\n");
    }
}
1563 
/** Simple random sampling with replacement.
 *
 * All lines in files and/or standard input are read into memory. Lines are then
 * picked at random, one at a time, and written out. The same line can be picked more
 * than once. Output stops once the requested number of samples (--n|num) has been
 * written. If no sample size was provided (sampleSize is zero), output continues
 * indefinitely. Nothing is sampled when the input has no lines.
 */
void simpleRandomSamplingWithReplacement(OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform;

    /* Load everything into memory and locate the individual lines. */
    const inputBlocks = readFileData!(No.hasRandomValue)(cmdopt, outputStream);
    const lines = identifyInputLines!(No.hasRandomValue, No.isWeighted)(inputBlocks, cmdopt);

    /* Nothing to sample from. */
    if (lines.length == 0) return;

    auto rng = Random(cmdopt.seed);

    /* Repeat forever if sampleSize is zero, otherwise print sampleSize lines. */
    immutable bool unlimited = (cmdopt.sampleSize == 0);
    size_t numWritten = 0;

    while (unlimited || numWritten != cmdopt.sampleSize)
    {
        immutable size_t pick = uniform(0, lines.length, rng);
        outputStream.put(lines[pick].data);
        outputStream.put("\n");
        if (!unlimited) ++numWritten;
    }
}
1599 
/** A container holding data read from a file or standard input.
 *
 * The InputBlock struct represents one block of data read from a file or standard
 * input. readFileData returns an array of InputBlocks. A file whose size is known
 * is read as a single block. Standard input, and files whose size cannot be
 * determined, are read as one or more blocks. Individual lines are not allowed to
 * span blocks. The blocks belonging to one file are numbered starting with zero, so
 * a fileBlockNumber of zero marks the start of a new file.
 *
 * Note: the blocks are constructed positionally (filename, fileBlockNumber), so the
 * field order is significant.
 *
 * See readFileData() for more information.
 */
static struct InputBlock
{
    string filename;          /// Original filename or path. "-" denotes standard input.
    size_t fileBlockNumber;   /// Zero-based block number for the file.
    char[] data;              /// The actual data. Ends with a newline unless it is the last block for the file.
}
1616 
/** Read data from one or more files. This routine is used by algorithms needing to
 * read all data into memory.
 *
 * readFileData reads all data from the input sources in cmdopt and returns it as an
 * array of InputBlock structs. A file whose size can be determined is read as one
 * InputBlock with memory reserved up front to match the file size. Standard input,
 * and files whose size cannot be determined, are read as one or more blocks, which
 * avoids expensive memory reallocations as the data grows.
 *
 * Individual lines never span multiple blocks, and newlines are preserved. This
 * means that each block starts at the beginning of a line and ends with a newline
 * unless the end of a file has been reached.
 *
 * Before reading, the header from the first input source is written to outputStream
 * when header processing is on and that header is not empty. It is preceded by the
 * random value header and delimiter when random value printing is enabled.
 *
 * Each file gets its own block. Prior to using InputSourceRange this was so header
 * processing can be done. With InputSourceRange the header is read separately, so
 * this could be changed.
 */
InputBlock[] readFileData(HasRandomValue hasRandomValue, OutputRange)
(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : InputSourceRange, isFlushableOutputRange,
        throwIfWindowsNewline;

    static if(!hasRandomValue) assert(!cmdopt.printRandom);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstSource = cmdopt.inputSources.front;

        if (cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(firstSource.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Sizes used when reading. 'L' notation avoids overflow w/ 2GB+ sizes. */
    enum MultiBlockCapacity = 1024L * 1024L * 1024L;   // 1 GB per block.
    enum ChunkReadSize = 1024L * 128L;                 // Size of the raw read buffer.
    enum BackwardNewlineSearchLimit = 1024L * 16L;     // Max backward search for a newline.

    ubyte[] rawReadBuffer = new ubyte[ChunkReadSize];

    InputBlock[] blocks;
    auto blocksAppender = appender(&blocks);
    blocksAppender.reserve(cmdopt.inputSources.length);  // At least one block per file.

    foreach (source; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewline(source.header, source.name, 1);

        /* File.size() returns ulong.max if the file size cannot be determined.
         * Standard input is mapped to the same value so both cases share the
         * multiple block path. A known size allows reading as a single block.
         */
        immutable ulong knownSize = source.isStdin ? ulong.max : source.file.size;
        auto sourceFile = source.file;

        if (knownSize == ulong.max)
        {
            readFileDataAsMultipleBlocks(
                source.name, sourceFile, blocksAppender, rawReadBuffer,
                MultiBlockCapacity, BackwardNewlineSearchLimit);
        }
        else
        {
            readFileDataAsOneBlock(source.name, sourceFile, knownSize,
                                   blocksAppender, rawReadBuffer);
        }
    }

    return blocks;
}
1705 
/* readFileData() helper function. Reads all remaining data from a File handle into a
 * single new block, which is appended to an existing InputBlock[] array.
 *
 * readFileDataAsOneBlock is part of the readFileData logic. It handles the case where
 * a file is being read as a single block. The new block is given block number zero.
 * initialBlockSize is the amount of memory reserved for the block before reading,
 * normally the size of the file. rawReadBuffer is the scratch buffer used for the
 * chunked reads.
 *
 * This routine has been separated out to enable unit testing. At present it is not
 * intended as a general API. See readFileData for more info.
 */
private void readFileDataAsOneBlock(
    string filename,
    ref File ifile,
    const ulong initialBlockSize,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer)
{
    /* Add the block first, then bind an appender to the data member of the block
     * now at the end of the array, with memory reserved up front.
     */
    blocksAppender.put(InputBlock(filename, 0));

    auto blockData = appender(&(blocksAppender.data[$-1].data));
    blockData.reserve(initialBlockSize);

    /* Copy the file contents into the block one chunk at a time. */
    foreach (chunk; ifile.byChunk(rawReadBuffer))
    {
        blockData.put(cast(char[]) chunk);
    }
}
1732 
/* readFileData() helper function. Read data from a File handle as one or more blocks.
 * Blocks are appended to an existing InputBlock[] array.
 *
 * readFileDataAsMultipleBlocks is part of the readFileData logic. It handles the case
 * where a file or standard input is being read as a series of blocks. This is the
 * standard approach for standard input, but also applies when the file size cannot be
 * determined.
 *
 * Data is read in chunks and copied to the current block while it fits in the block's
 * remaining capacity. When a chunk does not fit, a new block is started at a line
 * boundary so that no line spans two blocks. If no suitable newline is found, the
 * current block is allowed to grow past its reserved capacity instead.
 *
 * Parameters:
 *   filename          - Name stored in every InputBlock created for this file.
 *   ifile             - File to read from. Must be open.
 *   blocksAppender    - Appender for the InputBlock[] array receiving the blocks.
 *   rawReadBuffer     - Scratch buffer used for the chunked reads.
 *   blockSize         - Capacity reserved for each block when it is created.
 *   newlineSearchSize - Maximum number of characters at the end of the current block
 *                       that are searched backward for a newline.
 *
 * This routine has been separated out to enable unit testing. At present it is not
 * intended as a general API. See readFileData for more info.
 */
private void readFileDataAsMultipleBlocks(
    string filename,
    ref File ifile,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer,
    const size_t blockSize,
    const size_t newlineSearchSize)
{
    import std.algorithm : find, min;
    import std.range : retro;

    assert(ifile.isOpen);

    /* Create a new block for the file and an Appender for writing data. The appender
     * is bound to the data member of the block at the end of the blocks array.
     */
    blocksAppender.put(InputBlock(filename, 0));
    auto dataAppender = appender(&(blocksAppender.data[$-1].data));
    dataAppender.reserve(blockSize);
    size_t blockNumber = 0;

    /* Read all the data and copy it to an InputBlock. */
    foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
    {
        assert(blockNumber == blocksAppender.data[$-1].fileBlockNumber);

        /* Space left in the current block, based on the appender's reported capacity. */
        immutable size_t remainingCapacity = dataAppender.capacity - dataAppender.data.length;

        if (buffer.length <= remainingCapacity)
        {
            /* The whole chunk fits in the current block. */
            dataAppender.put(cast(char[]) buffer);
        }
        else
        {
            /* Look for the last newline in the input buffer that fits in remaining
             * capacity of the block. appendRegion is the part of the buffer up to and
             * including that newline. It is empty if there is no such newline.
             */
            auto searchRegion = buffer[0 .. remainingCapacity];
            auto appendRegion = searchRegion.retro.find('\n').source;

            if (appendRegion.length > 0)
            {
                /* Copy the first part of the read buffer to the block. */
                dataAppender.put(cast(char[]) appendRegion);

                /* Create a new InputBlock and copy the remaining data to it. The
                 * appender is re-bound to the new block's data.
                 */
                blockNumber++;
                blocksAppender.put(InputBlock(filename, blockNumber));
                dataAppender = appender(&(blocksAppender.data[$-1].data));
                dataAppender.reserve(blockSize);
                dataAppender.put(cast(char[]) buffer[appendRegion.length .. $]);

                assert(blocksAppender.data.length >= 2);
                assert(blocksAppender.data[$-2].data[$-1] == '\n');
            }
            else
            {
                /* Search backward in the current block for a newline. If found, it
                 * becomes the last newline in the current block. Anything following
                 * it is moved to the new block. If a newline is not found, simply
                 * append to the current block and let it grow. We'll only search
                 * backward so far (at most newlineSearchSize characters).
                 */
                immutable size_t currBlockLength = blocksAppender.data[$-1].data.length;
                immutable size_t searchLength = min(currBlockLength, newlineSearchSize);
                immutable size_t searchStart = currBlockLength - searchLength;
                auto blockSearchRegion = blocksAppender.data[$-1].data[searchStart .. $];

                /* Length of the search region up to and including its last newline.
                 * Zero if the search region has no newline.
                 */
                auto lastNewlineOffset = blockSearchRegion.retro.find('\n').source.length;

                if (lastNewlineOffset != 0)
                {
                    /* Create a new InputBlock. The previous InputBlock is then found
                     * at blocksAppender.data[$-2]. It may be a physically different
                     * struct (a copy) if the blocks array gets reallocated.
                     */
                    blockNumber++;
                    blocksAppender.put(InputBlock(filename, blockNumber));
                    dataAppender = appender(&(blocksAppender.data[$-1].data));
                    dataAppender.reserve(blockSize);

                    /* Copy data following the newline from the last block to the new
                     * block. Then append the current read buffer.
                     */
                    immutable size_t moveRegionStart = searchStart + lastNewlineOffset;
                    dataAppender.put(blocksAppender.data[$-2].data[moveRegionStart .. $]);
                    dataAppender.put(cast(char[]) buffer);

                    /* Now delete the moved region from the last block. */
                    blocksAppender.data[$-2].data.length = moveRegionStart;

                    assert(blocksAppender.data.length >= 2);
                    assert(blocksAppender.data[$-2].data[$-1] == '\n');
                }
                else
                {
                    /* Give up. Allow the current block to grow. */
                    dataAppender.put(cast(char[]) buffer);
                }
            }
        }
    }
}
1845 
/** HasRandomValue is a boolean flag used at compile time to distinguish use cases
 * needing random value assignments from those that don't. It is a template parameter
 * of readFileData, identifyInputLines, and the InputLine struct.
 */
alias HasRandomValue = Flag!"hasRandomValue";
1850 
/** An InputLine array is returned by identifyInputLines to represent each non-header
 * line found in an InputBlock array. The 'data' member contains the line, without its
 * trailing newline. A 'randomValue' member is included if random values are being
 * generated.
 */
static struct InputLine(HasRandomValue hasRandomValue)
{
    const(char)[] data;
    static if (hasRandomValue) double randomValue;
}
1860 
/** identifyInputLines is used by algorithms that read all files into memory prior to
 * processing. It does the initial processing of the file data.
 *
 * Two main tasks are performed. One is splitting all input data into lines. The second
 * is assigning a random value to the line, if random values are being generated. For
 * weighted use cases the random value is derived from the line's weight field. Lines
 * with a weight of zero or less get a random value of zero.
 *
 * The key input is an InputBlock array. Normally one block for each file, but standard
 * input may have multiple blocks. A single trailing newline is dropped from each block
 * before splitting so that it does not produce an extra empty line. A block consisting
 * of exactly one newline is a single empty line and is reported as such.
 *
 * The return value is an array of InputLine structs. The struct will have a 'randomValue'
 * member if random values are being assigned. The 'data' members are slices of the
 * block data, so the blocks must outlive the returned array.
 *
 * The first line of each file is checked for a Windows newline when no header is in
 * use. Errors from that check, and from weight field parsing, are raised as
 * exceptions by the helpers called.
 */
InputLine!hasRandomValue[] identifyInputLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted)
(const ref InputBlock[] inputBlocks, ref TsvSampleOptions cmdopt)
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewline;

    static assert(hasRandomValue || !isWeighted);
    static if(!hasRandomValue) assert(!cmdopt.printRandom);

    InputLine!hasRandomValue[] inputLines;

    auto linesAppender = appender(&inputLines);
    static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed);

    /* Note: fileLineNum is zero-based here. One-based in most other code in this file.
     * It is incremented before use, so it is one-based at the point a line is handled.
     */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 1 : 0;
    size_t fileLineNum = fileBodyStartLine;

    /* Record one input line: advance the line number, run the first-line newline
     * check, and append the line with its random value (if any).
     */
    void addLine(const(char)[] line, string filename)
    {
        fileLineNum++;

        if (fileLineNum == 1) throwIfWindowsNewline(line, filename, fileLineNum);

        static if (!hasRandomValue)
        {
            linesAppender.put(InputLine!hasRandomValue(line));
        }
        else
        {
            static if (!isWeighted)
            {
                immutable double randomValue = uniform01(randomGenerator);
            }
            else
            {
                immutable double lineWeight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                         filename, fileLineNum);
                immutable double randomValue =
                    (lineWeight > 0.0)
                    ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                    : 0.0;
            }

            linesAppender.put(InputLine!hasRandomValue(line, randomValue));
        }
    }

    foreach (block; inputBlocks)
    {
        /* Drop the last newline to avoid adding an extra empty line. */
        immutable bool hasTrailingNewline = block.data.length > 0 && block.data[$-1] == '\n';
        const data = hasTrailingNewline ? block.data[0 .. $-1] : block.data;

        /* Block number zero is the start of a new file. Restart line numbering. */
        if (block.fileBlockNumber == 0) fileLineNum = fileBodyStartLine;

        if (hasTrailingNewline && data.length == 0)
        {
            /* The block is exactly "\n", which is one empty line. splitter produces
             * no elements for empty input, so the line has to be added explicitly or
             * it would be silently lost.
             */
            addLine(data, block.filename);
        }
        else
        {
            foreach (line; data.splitter('\n')) addLine(line, block.filename);
        }
    }

    return inputLines;
}
1935 
1936 
/* Unit tests for readFileData. These tests focus on multiple InputBlock scenarios.
 * Other use paths are well tested by the test cases at the end.
 */
1940 unittest
1941 {
1942     import tsv_utils.common.unittest_utils;
1943     import std.algorithm : equal, find, joiner, splitter;
1944     import std.array : appender;
1945     import std.file : rmdirRecurse;
1946     import std.path : buildPath;
1947     import std.range : repeat;
1948 
1949     auto rfdTestDir = makeUnittestTempDir("tsv_sample_readFileData");
1950     scope(exit) rfdTestDir.rmdirRecurse;
1951 
1952     char[] file1Data;
1953     char[] file2Data;
1954     char[] file3Data;
1955 
1956     auto app1 = appender(&file1Data);
1957     auto app2 = appender(&file2Data);
1958     auto app3 = appender(&file3Data);
1959 
1960     /* File 1: 1000 short lines. */
1961     app1.put("\n".repeat(100).joiner);
1962     app1.put("x\n".repeat(100).joiner);
1963     app1.put("yz\n".repeat(100).joiner);
1964     app1.put("pqr\n".repeat(100).joiner);
1965     app1.put("a\nbc\ndef\n".repeat(100).joiner);
1966     app1.put('\n'.repeat(100));
1967     app1.put("z\n".repeat(100).joiner);
1968     app1.put("xy\n".repeat(100).joiner);
1969 
1970     /* File 2: 500 longer lines. */
1971     app2.put(
1972         "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
1973         .repeat(100)
1974         .joiner);
1975     app2.put(
1976         "|abcdefghijklmnopqrstuv|\n|0123456789|\n|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ|\n|abcdefghijklmnopqrstuvwxyz|\n"
1977         .repeat(100)
1978         .joiner);
1979     app2.put(
1980          "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
1981         .repeat(100)
1982         .joiner);
1983 
1984     /* File 3: 1000 mixed length lines. */
1985     app3.put("\n\n|abcde|\n1\n12\n123\n|abcdefghijklmnop|\n|xyz|\n0123456789\nX\n".repeat(100).joiner);
1986 
1987     string file1Path = buildPath(rfdTestDir, "file1.txt");
1988     string file2Path = buildPath(rfdTestDir, "file2.txt");
1989     string file3Path = buildPath(rfdTestDir, "file3.txt");
1990 
1991     try
1992     {
1993         auto ofile1 = File(file1Path, "wb");
1994         ofile1.write(file1Data);
1995         ofile1.close;
1996     }
1997     catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file1Path, e.msg));
1998 
1999     try
2000     {
2001         auto ofile2 = File(file2Path, "wb");
2002         ofile2.write(file2Data);
2003         ofile2.close;
2004     }
2005     catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file2Path, e.msg));
2006 
2007     try
2008     {
2009         auto ofile3 = File(file3Path, "wb");
2010         ofile3.write(file3Data);
2011         ofile3.close;
2012     }
2013     catch  (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file3Path, e.msg));
2014 
2015     auto allData = file1Data ~ file2Data ~ file3Data;
2016     auto expectedLines = allData.splitter('\n').array[0 .. $-1];
2017 
2018     auto file2DataNoHeader = (file2Data.find('\n'))[1 .. $];
2019     auto file3DataNoHeader = (file3Data.find('\n'))[1 .. $];
2020     auto allDataUsingHeader = file1Data ~ file2DataNoHeader ~ file3DataNoHeader;
2021     auto expectedLinesUsingHeader = allDataUsingHeader.splitter('\n').array[0 .. $-1];
2022 
2023     assert(expectedLines.length == expectedLinesUsingHeader.length + 2);
2024 
2025     /* We need real files for creating command line arg structs.
2026      */
2027     string file1Copy1Path = buildPath(rfdTestDir, "file1_copy1.txt");
2028     string file1Copy2Path = buildPath(rfdTestDir, "file1_copy2.txt");
2029 
2030     try
2031     {
2032         auto ofile = File(file1Copy1Path, "wb");
2033         ofile.write(file1Data);
2034         ofile.close;
2035     }
2036     catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file1Copy1Path, e.msg));
2037 
2038     try
2039     {
2040         auto ofile = File(file1Copy2Path, "wb");
2041         ofile.write(file1Data);
2042         ofile.close;
2043     }
2044     catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file1Copy2Path, e.msg));
2045 
2046     TsvSampleOptions cmdoptNoHeader;
2047     auto noHeaderCmdArgs = ["unittest", file1Copy1Path];
2048     auto r1 = cmdoptNoHeader.processArgs(noHeaderCmdArgs);
2049     assert(r1[0], format("Invalid command lines arg: '%s'.", noHeaderCmdArgs));
2050 
2051     TsvSampleOptions cmdoptYesHeader;
2052     auto yesHeaderCmdArgs = ["unittest", "--header", file1Copy2Path];
2053     auto r2 = cmdoptYesHeader.processArgs(yesHeaderCmdArgs);
2054     assert(r2[0], format("Invalid command lines arg: '%s'.", yesHeaderCmdArgs));
2055 
2056     scope (exit)
2057     {
2058         /* Close the files being used by the cmdopt[yes|no]Header structs. */
2059         while (!cmdoptNoHeader.inputSources.empty) cmdoptNoHeader.inputSources.popFront;
2060         while (!cmdoptYesHeader.inputSources.empty) cmdoptYesHeader.inputSources.popFront;
2061     }
2062 
2063     auto outputStream = appender!(char[])();
2064 
2065     {
2066         /* Reading as single blocks. */
2067         ubyte[] rawReadBuffer = new ubyte[256];
2068         InputBlock[] blocks;
2069         auto blocksAppender = appender(&blocks);
2070         blocksAppender.reserve(3);
2071         foreach (f; [ file1Path, file2Path, file3Path ])
2072         {
2073             auto ifile = f.File("rb");
2074             ulong filesize = ifile.size;
2075             if (filesize == ulong.max) filesize = 1000;
2076             readFileDataAsOneBlock(f, ifile, filesize, blocksAppender, rawReadBuffer);
2077             ifile.close;
2078         }
2079         auto inputLines =
2080             identifyInputLines!(No.hasRandomValue, No.isWeighted)(
2081                 blocks, cmdoptNoHeader);
2082 
2083         assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
2084     }
2085 
2086     {
2087         /* Reading as multiple blocks. */
2088         foreach (size_t searchSize; [ 0, 1, 2, 64 ])
2089         {
2090             foreach (size_t blockSize; [ 1, 2, 16, 64, 256 ])
2091             {
2092                 foreach (size_t readSize; [ 1, 2, 8, 32 ])
2093                 {
2094                     ubyte[] rawReadBuffer = new ubyte[readSize];
2095                     InputBlock[] blocks;
2096                     auto blocksAppender = appender(&blocks);
2097                     blocksAppender.reserve(3);
2098                     foreach (f; [ file1Path, file2Path, file3Path ])
2099                     {
2100                         auto ifile = f.File("rb");
2101                         readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
2102                                                      rawReadBuffer, blockSize, searchSize);
2103                         ifile.close;
2104                     }
2105                     auto inputLines =
2106                         identifyInputLines!(No.hasRandomValue, No.isWeighted)(
2107                             blocks, cmdoptNoHeader);
2108 
2109                     assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
2110                 }
2111             }
2112         }
2113     }
2114     version(none) {
2115     {
2116         /* Reading as multiple blocks, with header processing. */
2117         const size_t readSize = 32;
2118         const size_t blockSize = 48;
2119         const size_t searchSize = 16;
2120 
2121         ubyte[] rawReadBuffer = new ubyte[readSize];
2122         InputBlock[] blocks;
2123         auto blocksAppender = appender(&blocks);
2124         blocksAppender.reserve(3);
2125         foreach (f; [ file1Path, file2Path, file3Path ])
2126         {
2127             auto ifile = f.File("rb");
2128             readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
2129                                          rawReadBuffer, blockSize, searchSize);
2130             ifile.close;
2131         }
2132         auto inputLines =
2133             identifyInputLines!(No.hasRandomValue, No.isWeighted)(
2134                 blocks, cmdoptYesHeader);
2135 
2136         assert(outputStream.data == expectedLinesUsingHeader[0] ~ '\n');
2137         assert(equal!((a, b) => a.data == b)(inputLines, expectedLinesUsingHeader[1 .. $]));
2138     }
2139     }
2140 }
2141 
/** Print a random value (a double) to an output range, preferring plain decimal text.
 *
 * The number of digits after the decimal point is chosen from the magnitude of the
 * value: 17 places for values of at least 0.1, one more place for each further power
 * of ten down to 28 places for values of at least 1e-12. This keeps about 17
 * significant digits, the range available in doubles, without using an exponent. Any
 * value that does not reach the smallest threshold is written with "%.17g", which may
 * produce exponential notation.
 *
 * Decimal notation is preferred because it enables faster sorting on the random values
 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster
 * on decimal numeric sorts ('n' switch) than on general numeric sorts ('g' switch),
 * the latter being what handles exponential notation. The difference is 5-10x.
 *
 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12.
 * No examples less than 1e-09 were seen in hundreds of millions of trials. Similar
 * results were seen with weighted sampling with integer weights. The same is not true
 * with floating point weights, which produce quite large exponents. Even so, the
 * approach remains useful there: for random weights in [0,1] less than 5% will be
 * less than 1e-12 and use exponential notation.
 */
void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;

    /* The two tables are parallel: a value that is at least lowerBounds[i], and
     * below every earlier bound, is printed with fixedPointFormats[i].
     */
    static immutable double[12] lowerBounds = [
        1e-01, 1e-02, 1e-03, 1e-04, 1e-05, 1e-06,
        1e-07, 1e-08, 1e-09, 1e-10, 1e-11, 1e-12,
    ];

    static immutable string[12] fixedPointFormats = [
        "%.17f", "%.18f", "%.19f", "%.20f", "%.21f", "%.22f",
        "%.23f", "%.24f", "%.25f", "%.26f", "%.27f", "%.28f",
    ];

    /* Fallback used when no bound is reached. */
    string chosenFormat = "%.17g";

    foreach (i, bound; lowerBounds)
    {
        if (value >= bound)
        {
            chosenFormat = fixedPointFormats[i];
            break;
        }
    }

    const formatSpec = singleSpec(chosenFormat);
    outputStream.formatValue(value, formatSpec);
}
2197 
@safe unittest
{
    import std.array : appender;

    /* One row per check: the value to print and the exact text expected back. */
    static struct Check
    {
        double value;
        string expected;
    }

    static immutable Check[] checks = [
        /* Powers of ten, covering every format threshold plus the fallback. */
        Check(1.0,   "1.00000000000000000"),
        Check(0.1,   "0.10000000000000001"),
        Check(0.01,  "0.010000000000000000"),
        Check(1e-03, "0.0010000000000000000"),
        Check(1e-04, "0.00010000000000000000"),
        Check(1e-05, "0.000010000000000000001"),
        Check(1e-06, "0.0000010000000000000000"),
        Check(1e-07, "0.00000010000000000000000"),
        Check(1e-08, "0.000000010000000000000000"),
        Check(1e-09, "0.0000000010000000000000001"),
        Check(1e-10, "0.00000000010000000000000000"),
        Check(1e-11, "0.000000000009999999999999999"),
        Check(1e-12, "0.0000000000010000000000000000"),
        Check(1e-13, "1e-13"),
        Check(1e-14, "1e-14"),

        /* The same 17 digit mantissa shifted through each magnitude. */
        Check(12345678901234567e-15, "12.34567890123456735"),
        Check(12345678901234567e-16, "1.23456789012345669"),
        Check(12345678901234567e-17, "0.12345678901234566"),
        Check(12345678901234567e-18, "0.012345678901234567"),
        Check(12345678901234567e-19, "0.0012345678901234567"),
        Check(12345678901234567e-20, "0.00012345678901234567"),
        Check(12345678901234567e-21, "0.000012345678901234568"),
        Check(12345678901234567e-22, "0.0000012345678901234567"),
        Check(12345678901234567e-23, "0.00000012345678901234566"),
        Check(12345678901234567e-24, "0.000000012345678901234567"),
        Check(12345678901234567e-25, "0.0000000012345678901234566"),
        Check(12345678901234567e-26, "0.00000000012345678901234568"),
        Check(12345678901234567e-27, "0.000000000012345678901234567"),
        Check(12345678901234567e-28, "0.0000000000012345678901234567"),
        Check(12345678901234567e-29, "1.2345678901234566e-13"),
    ];

    foreach (check; checks)
    {
        auto sink = appender!string();
        sink.formatRandomValue(check.value);
        assert(sink.data == check.expected,
               format("[testFormatValue] value: %g; expected: %s; actual: %s",
                      check.value, check.expected, sink.data));
    }
}
2241 
import std.traits : isSomeChar;

/** Extract one field from a line and convert it to type T, reporting failures with
 * error text tailored for this program. The extraction itself is delegated to
 * [tsv_utils.common.utils.getTsvFieldValue]; see that function for details.
 *
 * Any exception raised by the extraction is replaced with a new Exception whose
 * message carries the original message, the input source and the line number. A
 * filename of "-" is reported as "Standard Input". When a conversion failure
 * happens on line 1 the message also suggests that the line may be a header.
 */
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, ulong lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException;
    import tsv_utils.common.utils : getTsvFieldValue;

    /* Name of the input source as shown in error messages. */
    immutable string sourceName = (filename == "-") ? "Standard Input" : filename;

    try
    {
        return getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* The field text could not be converted to T. */
        immutable string headerHint =
            (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : "";

        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s%s",
                   exc.msg, sourceName, lineNum, headerHint));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s",
                   exc.msg, sourceName, lineNum));
    }
}
2275 
@safe unittest
{
    /* Basic sanity checks on the getFieldValue wrapper. getTsvFieldValue has its
     * own, more thorough tests.
     */
    import std.exception : assertThrown;

    /* Well-formed numeric fields convert to the expected values. */
    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);

    /* Failures must throw both on line 1 (where the header hint is added to the
     * message) and on later lines.
     */
    foreach (ulong lineNum; [1, 2])
    {
        /* Field text that is not a number. */
        assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", lineNum));

        /* Field index beyond the fields available on the line. */
        assertThrown(getFieldValue!double("123", 1, '\t', "unittest", lineNum));
    }
}
2290 
2291 /* Unit tests for the main program start here.
2292  *
2293  * Portability note: Many of the tests here rely on generating consistent random numbers
2294  * across different platforms when using the same random seed. So far this has succeeded
2295  * on several different platform, compiler, and library versions. However, it is certainly
2296  * possible this condition will not hold on other platforms.
2297  *
2298  * For tsv-sample, this portability implies generating the same results on different
2299  * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
2300  * but it is convenient for testing. If platforms are identified that do not generate
2301  * the same results these tests will need to be adjusted.
2302  */
version(unittest)
{
    /* Helpers shared by the unit tests that follow. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /** Run tsvSample for one command line and assert on the text it produces.
     *
     * cmdArgs is handed to TsvSampleOptions.processArgs and must not be empty; its
     * first element is used as a label in assertion messages. expected is converted
     * with tsvDataToString and must match the generated output exactly.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Prefix a failure description with this helper's name and cmdArgs[0]. */
        string failureMessage(string detail)
        {
            return format("[testTsvSample] %s: ", cmdArgs[0]) ~ detail;
        }

        /* Take the printable form of the arguments before they are passed on.
         * NOTE(review): presumably processArgs may modify cmdArgs - confirm.
         */
        immutable string argsAsText = cmdArgs.to!string;

        TsvSampleOptions options;
        auto parseResult = options.processArgs(cmdArgs);
        assert(parseResult[0],
               failureMessage(format("Invalid command lines arg: '%s'.", argsAsText)));

        auto actual = appender!(char[])();
        tsvSample(options, actual);    // This invokes the main code line.

        auto wanted = expected.tsvDataToString;

        assert(actual.data == wanted,
               failureMessage(
                   format("Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                          wanted.to!string, actual.data.to!string)));
    }
}
2338 
2339 unittest
2340 {
2341     import std.path : buildPath;
2342     import std.file : rmdirRecurse;
2343 
2344     auto testDir = makeUnittestTempDir("tsv_sample");
2345     scope(exit) testDir.rmdirRecurse;
2346 
2347     /* Tabular data sets and expected results use the built-in static seed.
2348      * Tests are run by writing the data set to a file, then calling the main
2349      * routine to process. The function testTsvSample plays the role of the
2350      * main program. Rather than writing to expected output, the results are
2351      * matched against expected. The expected results were verified by hand
2352      * prior to inclusion in the test.
2353      *
2354      * The initial part of this section is simply setting up data files and
2355      * expected results.
2356      *
2357      * Expected results naming conventions:
2358      *  - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected
2359      *  - Sampling Type (required): Permute (Shuffle), Sample, Replace, Bernoulli, Distinct
2360      *  - Compatibility: Compat, AlgoR, Skip, Swap, Inorder
2361      *  - Weight Field: Wt<num>, e.g. Wt3
2362      *  - Sample Size: Num<num>, eg. Num3
2363      *  - Seed Value: V<num>, eg. V77
2364      *  - Key Field: K<num>, e.g. K2
2365      *  - Probability: P<num>, e.g P05 (5%)
2366      *  - Printing Probabilities: Probs
2367      *  - Printing Probs in order: ProbsInorder
2368      *  - Printing Probs with custom header: RVCustom
2369      */
2370 
2371     /* Empty file. */
2372     string[][] dataEmpty = [];
2373     string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
2374     writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
2375 
2376     /* 3x0, header only. */
2377     string[][] data3x0 = [["field_a", "field_b", "field_c"]];
2378     string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
2379     writeUnittestTsvFile(fpath_data3x0, data3x0);
2380 
2381     /* 3x1 */
2382     string[][] data3x1 =
2383         [["field_a", "field_b", "field_c"],
2384          ["tan", "タン", "8.5"]];
2385 
2386     string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
2387     string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
2388     writeUnittestTsvFile(fpath_data3x1, data3x1);
2389     writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1 .. $]);
2390 
2391     string[][] data3x1ExpectedReplaceNum3 =
2392         [["field_a", "field_b", "field_c"],
2393          ["tan", "タン", "8.5"],
2394          ["tan", "タン", "8.5"],
2395          ["tan", "タン", "8.5"]];
2396 
2397     /* 3x2 */
2398     string[][] data3x2 =
2399         [["field_a", "field_b", "field_c"],
2400          ["brown", "褐色", "29.2"],
2401          ["gray", "グレー", "6.2"]];
2402 
2403     string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
2404     string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
2405     writeUnittestTsvFile(fpath_data3x2, data3x2);
2406     writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1 .. $]);
2407 
2408     string[][] data3x2PermuteCompat =
2409         [["field_a", "field_b", "field_c"],
2410          ["gray", "グレー", "6.2"],
2411          ["brown", "褐色", "29.2"]];
2412 
2413     string[][] data3x2PermuteShuffle =
2414         [["field_a", "field_b", "field_c"],
2415          ["gray", "グレー", "6.2"],
2416          ["brown", "褐色", "29.2"]];
2417 
2418     /* 3x3 */
2419     string[][] data3x3 =
2420         [["field_a", "field_b", "field_c"],
2421          ["orange", "オレンジ", "2.5"],
2422          ["pink", "ピンク", "1.1"],
2423          ["purple", "紫の", "42"]];
2424 
2425     string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
2426     string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
2427     writeUnittestTsvFile(fpath_data3x3, data3x3);
2428     writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1 .. $]);
2429 
2430     string[][] data3x3ExpectedPermuteCompat =
2431         [["field_a", "field_b", "field_c"],
2432          ["purple", "紫の", "42"],
2433          ["pink", "ピンク", "1.1"],
2434          ["orange", "オレンジ", "2.5"]];
2435 
2436     string[][] data3x3ExpectedPermuteSwap =
2437         [["field_a", "field_b", "field_c"],
2438          ["purple", "紫の", "42"],
2439          ["orange", "オレンジ", "2.5"],
2440          ["pink", "ピンク", "1.1"]];
2441 
2442     /* 3x6 */
2443     string[][] data3x6 =
2444         [["field_a", "field_b", "field_c"],
2445          ["red", "赤", "23.8"],
2446          ["green", "緑", "0.0072"],
2447          ["white", "白", "1.65"],
2448          ["yellow", "黄", "12"],
2449          ["blue", "青", "12"],
2450          ["black", "黒", "0.983"]];
2451     string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
2452     string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
2453     writeUnittestTsvFile(fpath_data3x6, data3x6);
2454     writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1 .. $]);
2455 
2456     // Randomization, all lines
2457     string[][] data3x6ExpectedPermuteCompat =
2458         [["field_a", "field_b", "field_c"],
2459          ["yellow", "黄", "12"],
2460          ["black", "黒", "0.983"],
2461          ["blue", "青", "12"],
2462          ["white", "白", "1.65"],
2463          ["green", "緑", "0.0072"],
2464          ["red", "赤", "23.8"]];
2465 
2466     string[][] data3x6ExpectedPermuteSwap =
2467         [["field_a", "field_b", "field_c"],
2468          ["black", "黒", "0.983"],
2469          ["green", "緑", "0.0072"],
2470          ["red", "赤", "23.8"],
2471          ["yellow", "黄", "12"],
2472          ["white", "白", "1.65"],
2473          ["blue", "青", "12"]];
2474 
2475     string[][] data3x6ExpectedPermuteCompatProbs =
2476         [["random_value", "field_a", "field_b", "field_c"],
2477          ["0.96055546286515892", "yellow", "黄", "12"],
2478          ["0.75710153928957880", "black", "黒", "0.983"],
2479          ["0.52525980887003243", "blue", "青", "12"],
2480          ["0.49287854949943721", "white", "白", "1.65"],
2481          ["0.15929344086907804", "green", "緑", "0.0072"],
2482          ["0.010968807619065046", "red", "赤", "23.8"]];
2483 
2484     /* Note: data3x6ExpectedSampleAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because
2485      * both are effectively the same algorithm given that --num is data length. Both read
2486      * in the full data in order then call randomShuffle.
2487      */
2488     string[][] data3x6ExpectedSampleAlgoRNum6 =
2489         [["field_a", "field_b", "field_c"],
2490          ["black", "黒", "0.983"],
2491          ["green", "緑", "0.0072"],
2492          ["red", "赤", "23.8"],
2493          ["yellow", "黄", "12"],
2494          ["white", "白", "1.65"],
2495          ["blue", "青", "12"]];
2496 
2497     string[][] data3x6ExpectedSampleAlgoRNum5 =
2498         [["field_a", "field_b", "field_c"],
2499          ["red", "赤", "23.8"],
2500          ["black", "黒", "0.983"],
2501          ["white", "白", "1.65"],
2502          ["green", "緑", "0.0072"],
2503          ["yellow", "黄", "12"]];
2504 
2505     string[][] data3x6ExpectedSampleAlgoRNum4 =
2506         [["field_a", "field_b", "field_c"],
2507          ["blue", "青", "12"],
2508          ["green", "緑", "0.0072"],
2509          ["black", "黒", "0.983"],
2510          ["white", "白", "1.65"]];
2511 
2512     string[][] data3x6ExpectedSampleAlgoRNum3 =
2513         [["field_a", "field_b", "field_c"],
2514          ["red", "赤", "23.8"],
2515          ["black", "黒", "0.983"],
2516          ["green", "緑", "0.0072"]];
2517 
2518     string[][] data3x6ExpectedSampleAlgoRNum2 =
2519         [["field_a", "field_b", "field_c"],
2520          ["black", "黒", "0.983"],
2521          ["red", "赤", "23.8"]];
2522 
2523     string[][] data3x6ExpectedSampleAlgoRNum1 =
2524         [["field_a", "field_b", "field_c"],
2525          ["green", "緑", "0.0072"]];
2526 
2527     /* Inorder versions. */
2528     string[][] data3x6ExpectedSampleAlgoRNum6Inorder =
2529         [["field_a", "field_b", "field_c"],
2530          ["red", "赤", "23.8"],
2531          ["green", "緑", "0.0072"],
2532          ["white", "白", "1.65"],
2533          ["yellow", "黄", "12"],
2534          ["blue", "青", "12"],
2535          ["black", "黒", "0.983"]];
2536 
2537     string[][] data3x6ExpectedSampleAlgoRNum5Inorder =
2538         [["field_a", "field_b", "field_c"],
2539          ["red", "赤", "23.8"],
2540          ["green", "緑", "0.0072"],
2541          ["white", "白", "1.65"],
2542          ["yellow", "黄", "12"],
2543          ["black", "黒", "0.983"]];
2544 
2545     string[][] data3x6ExpectedSampleAlgoRNum4Inorder =
2546         [["field_a", "field_b", "field_c"],
2547          ["green", "緑", "0.0072"],
2548          ["white", "白", "1.65"],
2549          ["blue", "青", "12"],
2550          ["black", "黒", "0.983"]];
2551 
2552     string[][] data3x6ExpectedSampleAlgoRNum3Inorder =
2553         [["field_a", "field_b", "field_c"],
2554          ["red", "赤", "23.8"],
2555          ["green", "緑", "0.0072"],
2556          ["black", "黒", "0.983"]];
2557 
2558     string[][] data3x6ExpectedSampleAlgoRNum2Inorder =
2559         [["field_a", "field_b", "field_c"],
2560          ["red", "赤", "23.8"],
2561          ["black", "黒", "0.983"]];
2562 
2563     string[][] data3x6ExpectedSampleAlgoRNum1Inorder =
2564         [["field_a", "field_b", "field_c"],
2565          ["green", "緑", "0.0072"]];
2566 
2567     /* Reservoir inorder */
2568     string[][] data3x6ExpectedSampleCompatNum6Inorder =
2569         [["field_a", "field_b", "field_c"],
2570          ["red", "赤", "23.8"],
2571          ["green", "緑", "0.0072"],
2572          ["white", "白", "1.65"],
2573          ["yellow", "黄", "12"],
2574          ["blue", "青", "12"],
2575          ["black", "黒", "0.983"]];
2576 
2577     string[][] data3x6ExpectedSampleCompatNum5Inorder =
2578         [["field_a", "field_b", "field_c"],
2579          ["green", "緑", "0.0072"],
2580          ["white", "白", "1.65"],
2581          ["yellow", "黄", "12"],
2582          ["blue", "青", "12"],
2583          ["black", "黒", "0.983"]];
2584 
2585     string[][] data3x6ExpectedSampleCompatNum4Inorder =
2586         [["field_a", "field_b", "field_c"],
2587          ["white", "白", "1.65"],
2588          ["yellow", "黄", "12"],
2589          ["blue", "青", "12"],
2590          ["black", "黒", "0.983"]];
2591 
2592     string[][] data3x6ExpectedSampleCompatNum3Inorder =
2593         [["field_a", "field_b", "field_c"],
2594          ["yellow", "黄", "12"],
2595          ["blue", "青", "12"],
2596          ["black", "黒", "0.983"]];
2597 
2598     string[][] data3x6ExpectedSampleCompatNum2Inorder =
2599         [["field_a", "field_b", "field_c"],
2600          ["yellow", "黄", "12"],
2601          ["black", "黒", "0.983"]];
2602 
2603     string[][] data3x6ExpectedSampleCompatNum1Inorder =
2604         [["field_a", "field_b", "field_c"],
2605          ["yellow", "黄", "12"]];
2606 
2607 
2608     /* Reservoir inorder with probabilities. */
2609     string[][] data3x6ExpectedSampleCompatNum6ProbsInorder =
2610         [["random_value", "field_a", "field_b", "field_c"],
2611          ["0.010968807619065046", "red", "赤", "23.8"],
2612          ["0.15929344086907804", "green", "緑", "0.0072"],
2613          ["0.49287854949943721", "white", "白", "1.65"],
2614          ["0.96055546286515892", "yellow", "黄", "12"],
2615          ["0.52525980887003243", "blue", "青", "12"],
2616          ["0.75710153928957880", "black", "黒", "0.983"]];
2617 
2618     string[][] data3x6ExpectedSampleCompatNum5ProbsInorder =
2619         [["random_value", "field_a", "field_b", "field_c"],
2620          ["0.15929344086907804", "green", "緑", "0.0072"],
2621          ["0.49287854949943721", "white", "白", "1.65"],
2622          ["0.96055546286515892", "yellow", "黄", "12"],
2623          ["0.52525980887003243", "blue", "青", "12"],
2624          ["0.75710153928957880", "black", "黒", "0.983"]];
2625 
2626     string[][] data3x6ExpectedSampleCompatNum4ProbsInorder =
2627         [["random_value", "field_a", "field_b", "field_c"],
2628          ["0.49287854949943721", "white", "白", "1.65"],
2629          ["0.96055546286515892", "yellow", "黄", "12"],
2630          ["0.52525980887003243", "blue", "青", "12"],
2631          ["0.75710153928957880", "black", "黒", "0.983"]];
2632 
2633     string[][] data3x6ExpectedSampleCompatNum3ProbsInorder =
2634         [["random_value", "field_a", "field_b", "field_c"],
2635          ["0.96055546286515892", "yellow", "黄", "12"],
2636          ["0.52525980887003243", "blue", "青", "12"],
2637          ["0.75710153928957880", "black", "黒", "0.983"]];
2638 
2639     string[][] data3x6ExpectedSampleCompatNum2ProbsInorder =
2640         [["random_value", "field_a", "field_b", "field_c"],
2641          ["0.96055546286515892", "yellow", "黄", "12"],
2642          ["0.75710153928957880", "black", "黒", "0.983"]];
2643 
2644     string[][] data3x6ExpectedSampleCompatNum1ProbsInorder =
2645         [["random_value", "field_a", "field_b", "field_c"],
2646          ["0.96055546286515892", "yellow", "黄", "12"]];
2647 
2648     string[][] data3x6ExpectedWt3Num6Inorder =
2649         [["field_a", "field_b", "field_c"],
2650          ["red", "赤", "23.8"],
2651          ["green", "緑", "0.0072"],
2652          ["white", "白", "1.65"],
2653          ["yellow", "黄", "12"],
2654          ["blue", "青", "12"],
2655          ["black", "黒", "0.983"]];
2656 
2657     string[][] data3x6ExpectedWt3Num5Inorder =
2658         [["field_a", "field_b", "field_c"],
2659          ["green", "緑", "0.0072"],
2660          ["white", "白", "1.65"],
2661          ["yellow", "黄", "12"],
2662          ["blue", "青", "12"],
2663          ["black", "黒", "0.983"]];
2664 
2665     string[][] data3x6ExpectedWt3Num4Inorder =
2666         [["field_a", "field_b", "field_c"],
2667          ["white", "白", "1.65"],
2668          ["yellow", "黄", "12"],
2669          ["blue", "青", "12"],
2670          ["black", "黒", "0.983"]];
2671 
2672     string[][] data3x6ExpectedWt3Num3Inorder =
2673         [["field_a", "field_b", "field_c"],
2674          ["yellow", "黄", "12"],
2675          ["blue", "青", "12"],
2676          ["black", "黒", "0.983"]];
2677 
2678     string[][] data3x6ExpectedWt3Num2Inorder =
2679         [["field_a", "field_b", "field_c"],
2680          ["yellow", "黄", "12"],
2681          ["black", "黒", "0.983"]];
2682 
2683     string[][] data3x6ExpectedWt3Num1Inorder =
2684         [["field_a", "field_b", "field_c"],
2685          ["yellow", "黄", "12"]];
2686 
2687 
2688     string[][] data3x6ExpectedBernoulliProbsP100 =
2689         [["random_value", "field_a", "field_b", "field_c"],
2690          ["0.010968807619065046", "red", "赤", "23.8"],
2691          ["0.15929344086907804", "green", "緑", "0.0072"],
2692          ["0.49287854949943721", "white", "白", "1.65"],
2693          ["0.96055546286515892", "yellow", "黄", "12"],
2694          ["0.52525980887003243", "blue", "青", "12"],
2695          ["0.75710153928957880", "black", "黒", "0.983"]];
2696 
2697     string[][] data3x6ExpectedBernoulliCompatProbsP60 =
2698         [["random_value", "field_a", "field_b", "field_c"],
2699          ["0.010968807619065046", "red", "赤", "23.8"],
2700          ["0.15929344086907804", "green", "緑", "0.0072"],
2701          ["0.49287854949943721", "white", "白", "1.65"],
2702          ["0.52525980887003243", "blue", "青", "12"]];
2703 
2704     string[][] data3x6ExpectedBernoulliSkipP40 =
2705         [["field_a", "field_b", "field_c"],
2706          ["red", "赤", "23.8"],
2707          ["green", "緑", "0.0072"],
2708          ["yellow", "黄", "12"]];
2709 
2710     string[][] data3x6ExpectedBernoulliCompatP60 =
2711         [["field_a", "field_b", "field_c"],
2712          ["red", "赤", "23.8"],
2713          ["green", "緑", "0.0072"],
2714          ["white", "白", "1.65"],
2715          ["blue", "青", "12"]];
2716 
2717     string[][] data3x6ExpectedDistinctK1K3P60 =
2718         [["field_a", "field_b", "field_c"],
2719          ["green", "緑", "0.0072"],
2720          ["white", "白", "1.65"],
2721          ["blue", "青", "12"]];
2722 
2723     string[][] data3x6ExpectedDistinctK1K3P60Probs =
2724         [["random_value", "field_a", "field_b", "field_c"],
2725          ["0", "green", "緑", "0.0072"],
2726          ["0", "white", "白", "1.65"],
2727          ["0", "blue", "青", "12"]];
2728 
2729     string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom =
2730         [["custom_random_value_header", "field_a", "field_b", "field_c"],
2731          ["0", "green", "緑", "0.0072"],
2732          ["0", "white", "白", "1.65"],
2733          ["0", "blue", "青", "12"]];
2734 
2735     string[][] data3x6ExpectedDistinctK2P2ProbsInorder =
2736         [["random_value", "field_a", "field_b", "field_c"],
2737          ["1", "red", "赤", "23.8"],
2738          ["0", "green", "緑", "0.0072"],
2739          ["0", "white", "白", "1.65"],
2740          ["1", "yellow", "黄", "12"],
2741          ["3", "blue", "青", "12"],
2742          ["2", "black", "黒", "0.983"]];
2743 
2744     string[][] data3x6ExpectedPermuteWt3Probs =
2745         [["random_value", "field_a", "field_b", "field_c"],
2746          ["0.99665198757645390", "yellow", "黄", "12"],
2747          ["0.94775884809836686", "blue", "青", "12"],
2748          ["0.82728234682286661", "red", "赤", "23.8"],
2749          ["0.75346697377181959", "black", "黒", "0.983"],
2750          ["0.65130103496422487", "white", "白", "1.65"],
2751          ["1.5636943712879866e-111", "green", "緑", "0.0072"]];
2752 
2753     string[][] data3x6ExpectedWt3ProbsInorder =
2754         [["random_value", "field_a", "field_b", "field_c"],
2755          ["0.82728234682286661", "red", "赤", "23.8"],
2756          ["1.5636943712879866e-111", "green", "緑", "0.0072"],
2757          ["0.65130103496422487", "white", "白", "1.65"],
2758          ["0.99665198757645390", "yellow", "黄", "12"],
2759          ["0.94775884809836686", "blue", "青", "12"],
2760          ["0.75346697377181959", "black", "黒", "0.983"]];
2761 
2762     string[][] data3x6ExpectedPermuteWt3 =
2763         [["field_a", "field_b", "field_c"],
2764          ["yellow", "黄", "12"],
2765          ["blue", "青", "12"],
2766          ["red", "赤", "23.8"],
2767          ["black", "黒", "0.983"],
2768          ["white", "白", "1.65"],
2769          ["green", "緑", "0.0072"]];
2770 
2771 
2772     string[][] data3x6ExpectedReplaceNum10 =
2773         [["field_a", "field_b", "field_c"],
2774          ["black", "黒", "0.983"],
2775          ["green", "緑", "0.0072"],
2776          ["green", "緑", "0.0072"],
2777          ["red", "赤", "23.8"],
2778          ["yellow", "黄", "12"],
2779          ["red", "赤", "23.8"],
2780          ["white", "白", "1.65"],
2781          ["yellow", "黄", "12"],
2782          ["yellow", "黄", "12"],
2783          ["white", "白", "1.65"],
2784         ];
2785 
2786     string[][] data3x6ExpectedReplaceNum10V77 =
2787         [["field_a", "field_b", "field_c"],
2788          ["black", "黒", "0.983"],
2789          ["red", "赤", "23.8"],
2790          ["black", "黒", "0.983"],
2791          ["yellow", "黄", "12"],
2792          ["green", "緑", "0.0072"],
2793          ["green", "緑", "0.0072"],
2794          ["green", "緑", "0.0072"],
2795          ["yellow", "黄", "12"],
2796          ["blue", "青", "12"],
2797          ["white", "白", "1.65"],
2798         ];
2799 
2800     /* Using a different static seed. */
2801     string[][] data3x6ExpectedPermuteCompatV41Probs =
2802         [["random_value", "field_a", "field_b", "field_c"],
2803          ["0.68057272653095424", "green", "緑", "0.0072"],
2804          ["0.67681624367833138", "blue", "青", "12"],
2805          ["0.32097338931635022", "yellow", "黄", "12"],
2806          ["0.25092361867427826", "red", "赤", "23.8"],
2807          ["0.15535934292711318", "black", "黒", "0.983"],
2808          ["0.046095821075141430", "white", "白", "1.65"]];
2809 
2810     string[][] data3x6ExpectedBernoulliCompatP60V41Probs =
2811         [["random_value", "field_a", "field_b", "field_c"],
2812          ["0.25092361867427826", "red", "赤", "23.8"],
2813          ["0.046095821075141430", "white", "白", "1.65"],
2814          ["0.32097338931635022", "yellow", "黄", "12"],
2815          ["0.15535934292711318", "black", "黒", "0.983"]];
2816 
2817     string[][] data3x6ExpectedPermuteWt3V41Probs =
2818         [["random_value", "field_a", "field_b", "field_c"],
2819          ["0.96799377498910666", "blue", "青", "12"],
2820          ["0.94356245792573568", "red", "赤", "23.8"],
2821          ["0.90964601024271996", "yellow", "黄", "12"],
2822          ["0.15491658409260103", "white", "白", "1.65"],
2823          ["0.15043620392537033", "black", "黒", "0.983"],
2824          ["6.1394674830701461e-24", "green", "緑", "0.0072"]];
2825 
2826     string[][] data3x6ExpectedWt3V41ProbsInorder =
2827         [["random_value", "field_a", "field_b", "field_c"],
2828          ["0.94356245792573568", "red", "赤", "23.8"],
2829          ["6.1394674830701461e-24", "green", "緑", "0.0072"],
2830          ["0.15491658409260103", "white", "白", "1.65"],
2831          ["0.90964601024271996", "yellow", "黄", "12"],
2832          ["0.96799377498910666", "blue", "青", "12"],
2833          ["0.15043620392537033", "black", "黒", "0.983"]];
2834 
2835 
2836     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2837     string[][] combo1ExpectedPermuteCompat =
2838         [["field_a", "field_b", "field_c"],
2839          ["yellow", "黄", "12"],
2840          ["tan", "タン", "8.5"],
2841          ["brown", "褐色", "29.2"],
2842          ["green", "緑", "0.0072"],
2843          ["red", "赤", "23.8"],
2844          ["purple", "紫の", "42"],
2845          ["black", "黒", "0.983"],
2846          ["white", "白", "1.65"],
2847          ["gray", "グレー", "6.2"],
2848          ["blue", "青", "12"],
2849          ["pink", "ピンク", "1.1"],
2850          ["orange", "オレンジ", "2.5"]];
2851 
2852     string[][] combo1ExpectedPermuteCompatProbs =
2853         [["random_value", "field_a", "field_b", "field_c"],
2854          ["0.97088520275428891", "yellow", "黄", "12"],
2855          ["0.96055546286515892", "tan", "タン", "8.5"],
2856          ["0.81756894313730299", "brown", "褐色", "29.2"],
2857          ["0.75710153928957880", "green", "緑", "0.0072"],
2858          ["0.52525980887003243", "red", "赤", "23.8"],
2859          ["0.49287854949943721", "purple", "紫の", "42"],
2860          ["0.47081507067196071", "black", "黒", "0.983"],
2861          ["0.38388182921335101", "white", "白", "1.65"],
2862          ["0.29215990612283349", "gray", "グレー", "6.2"],
2863          ["0.24033216014504433", "blue", "青", "12"],
2864          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2865          ["0.010968807619065046", "orange", "オレンジ", "2.5"]];
2866 
    /* Combo 1 (3x3, 3x1, 3x6, 3x2) with rows kept in input order. No data files, only expected results. */
2868     string[][] combo1ExpectedProbsInorder =
2869         [["random_value", "field_a", "field_b", "field_c"],
2870          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2871          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2872          ["0.49287854949943721", "purple", "紫の", "42"],
2873          ["0.96055546286515892", "tan", "タン", "8.5"],
2874          ["0.52525980887003243", "red", "赤", "23.8"],
2875          ["0.75710153928957880", "green", "緑", "0.0072"],
2876          ["0.38388182921335101", "white", "白", "1.65"],
2877          ["0.97088520275428891", "yellow", "黄", "12"],
2878          ["0.24033216014504433", "blue", "青", "12"],
2879          ["0.47081507067196071", "black", "黒", "0.983"],
2880          ["0.81756894313730299", "brown", "褐色", "29.2"],
2881          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2882 
2883     string[][] combo1ExpectedBernoulliCompatP50Probs =
2884         [["random_value", "field_a", "field_b", "field_c"],
2885          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2886          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2887          ["0.49287854949943721", "purple", "紫の", "42"],
2888          ["0.38388182921335101", "white", "白", "1.65"],
2889          ["0.24033216014504433", "blue", "青", "12"],
2890          ["0.47081507067196071", "black", "黒", "0.983"],
2891          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2892 
2893     string[][] combo1ExpectedBernoulliCompatP40 =
2894         [["field_a", "field_b", "field_c"],
2895          ["orange", "オレンジ", "2.5"],
2896          ["pink", "ピンク", "1.1"],
2897          ["white", "白", "1.65"],
2898          ["blue", "青", "12"],
2899          ["gray", "グレー", "6.2"]];
2900 
2901     string[][] combo1ExpectedDistinctK1P40 =
2902         [["field_a", "field_b", "field_c"],
2903          ["orange", "オレンジ", "2.5"],
2904          ["red", "赤", "23.8"],
2905          ["green", "緑", "0.0072"],
2906          ["blue", "青", "12"],
2907          ["black", "黒", "0.983"]];
2908 
2909     string[][] combo1ExpectedPermuteWt3Probs =
2910         [["random_value", "field_a", "field_b", "field_c"],
2911          ["0.99754077523718754", "yellow", "黄", "12"],
2912          ["0.99527665440088786", "tan", "タン", "8.5"],
2913          ["0.99312578945741659", "brown", "褐色", "29.2"],
2914          ["0.98329602553389361", "purple", "紫の", "42"],
2915          ["0.97330961938083660", "red", "赤", "23.8"],
2916          ["0.88797551521739648", "blue", "青", "12"],
2917          ["0.81999230489041786", "gray", "グレー", "6.2"],
2918          ["0.55975569204250941", "white", "白", "1.65"],
2919          ["0.46472135609205739", "black", "黒", "0.983"],
2920          ["0.18824582704191337", "pink", "ピンク", "1.1"],
2921          ["0.16446131853299920", "orange", "オレンジ", "2.5"],
2922          ["1.6438086931020549e-17", "green", "緑", "0.0072"]];
2923 
2924     string[][] combo1ExpectedPermuteWt3 =
2925         [["field_a", "field_b", "field_c"],
2926          ["yellow", "黄", "12"],
2927          ["tan", "タン", "8.5"],
2928          ["brown", "褐色", "29.2"],
2929          ["purple", "紫の", "42"],
2930          ["red", "赤", "23.8"],
2931          ["blue", "青", "12"],
2932          ["gray", "グレー", "6.2"],
2933          ["white", "白", "1.65"],
2934          ["black", "黒", "0.983"],
2935          ["pink", "ピンク", "1.1"],
2936          ["orange", "オレンジ", "2.5"],
2937          ["green", "緑", "0.0072"]];
2938 
2939         string[][] combo1ExpectedSampleAlgoRNum4 =
2940         [["field_a", "field_b", "field_c"],
2941          ["blue", "青", "12"],
2942          ["gray", "グレー", "6.2"],
2943          ["brown", "褐色", "29.2"],
2944          ["white", "白", "1.65"]];
2945 
2946         string[][] combo1ExpectedSampleAlgoRNum4Inorder =
2947         [["field_a", "field_b", "field_c"],
2948          ["white", "白", "1.65"],
2949          ["blue", "青", "12"],
2950          ["brown", "褐色", "29.2"],
2951          ["gray", "グレー", "6.2"]];
2952 
2953     string[][] combo1ExpectedReplaceNum10 =
2954         [["field_a", "field_b", "field_c"],
2955          ["gray", "グレー", "6.2"],
2956          ["yellow", "黄", "12"],
2957          ["yellow", "黄", "12"],
2958          ["white", "白", "1.65"],
2959          ["tan", "タン", "8.5"],
2960          ["white", "白", "1.65"],
2961          ["blue", "青", "12"],
2962          ["black", "黒", "0.983"],
2963          ["tan", "タン", "8.5"],
2964          ["purple", "紫の", "42"]];
2965 
2966     /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. */
2967     string[][] data1x200 =
2968         [["field_a"],
2969          ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"],
2970          ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"],
2971          ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"],
2972          ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"],
2973          ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"],
2974          ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"],
2975          ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"],
2976          ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"],
2977          ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"],
2978          ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"],
2979          ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"],
2980          ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"],
2981          ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"],
2982          ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"],
2983          ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"],
2984          ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"],
2985          ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"],
2986          ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"],
2987          ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"],
2988          ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"],
2989         ];
2990 
2991     string fpath_data1x200 = buildPath(testDir, "data1x200.tsv");
2992     string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv");
2993     writeUnittestTsvFile(fpath_data1x200, data1x200);
2994     writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1 .. $]);
2995 
2996     string[][] data1x200ExpectedBernoulliSkipV333P01 =
2997         [["field_a"],
2998          ["077"],
2999          ["119"]];
3000 
3001     string[][] data1x200ExpectedBernoulliSkipV333P02 =
3002         [["field_a"],
3003          ["038"],
3004          ["059"],
3005          ["124"],
3006          ["161"],
3007          ["162"],
3008          ["183"]];
3009 
3010     string[][] data1x200ExpectedBernoulliSkipV333P03 =
3011         [["field_a"],
3012          ["025"],
3013          ["039"],
3014          ["082"],
3015          ["107"],
3016          ["108"],
3017          ["122"],
3018          ["136"],
3019          ["166"],
3020          ["182"]];
3021 
3022     string[][] data1x200ExpectedBernoulliCompatV333P01 =
3023         [["field_a"],
3024          ["072"]];
3025 
3026     string[][] data1x200ExpectedBernoulliCompatV333P02 =
3027         [["field_a"],
3028          ["004"],
3029          ["072"]];
3030 
3031     string[][] data1x200ExpectedBernoulliCompatV333P03 =
3032         [["field_a"],
3033          ["004"],
3034          ["072"],
3035          ["181"]];
3036 
    /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files,
     * only expected results. The header is from 3x0. The results are offset by one position
     * from data1x200ExpectedBernoulliSkipV333P03 because a single data line precedes the
     * 1x200 data in the combined input.
     */
3041     string[][] combo2ExpectedBernoulliSkipV333P03 =
3042         [["field_a", "field_b", "field_c"],
3043          ["024"],
3044          ["038"],
3045          ["081"],
3046          ["106"],
3047          ["107"],
3048          ["121"],
3049          ["135"],
3050          ["165"],
3051          ["181"]];
3052 
3053 
3054     /* 1x10 - Simple 1-column file. */
3055     string[][] data1x10 =
3056         [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
3057     string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
3058     string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
3059     writeUnittestTsvFile(fpath_data1x10, data1x10);
3060     writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1 .. $]);
3061 
3062     string[][] data1x10ExpectedPermuteCompat =
3063         [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
3064 
3065     string[][] data1x10ExpectedPermuteWt1 =
3066         [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]];
3067 
3068     /* 2x10a - Uniform distribution [0,1]. */
3069     string[][] data2x10a =
3070         [["line", "weight"],
3071          ["1", "0.26788837"],
3072          ["2", "0.06601298"],
3073          ["3", "0.38627527"],
3074          ["4", "0.47379424"],
3075          ["5", "0.02966641"],
3076          ["6", "0.05636231"],
3077          ["7", "0.70529242"],
3078          ["8", "0.91836862"],
3079          ["9", "0.99103720"],
3080          ["10", "0.31401740"]];
3081 
3082     string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv");
3083     writeUnittestTsvFile(fpath_data2x10a, data2x10a);
3084 
3085     string[][] data2x10aExpectedPermuteWt2Probs =
3086         [["random_value", "line", "weight"],
3087          ["0.96833865494543658", "8", "0.91836862"],
3088          ["0.91856842054413923", "4", "0.47379424"],
3089          ["0.25730832087795091", "7", "0.70529242"],
3090          ["0.23725317907018120", "9", "0.99103720"],
3091          ["0.16016096701872204", "3", "0.38627527"],
3092          ["0.090819662667243381", "10", "0.31401740"],
3093          ["0.0071764539244361172", "6", "0.05636231"],
3094          ["0.000000048318642951630057", "1", "0.26788837"],
3095          ["0.00000000037525692966535517", "5", "0.02966641"],
3096          ["8.2123247880095796e-13", "2", "0.06601298"]];
3097 
3098     /* 2x10b - Uniform distribution [0,1000]. */
3099     string[][] data2x10b =
3100         [["line", "weight"],
3101          ["1", "761"],
3102          ["2", "432"],
3103          ["3", "103"],
3104          ["4", "448"],
3105          ["5", "750"],
3106          ["6", "711"],
3107          ["7", "867"],
3108          ["8", "841"],
3109          ["9", "963"],
3110          ["10", "784"]];
3111 
3112     string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv");
3113     writeUnittestTsvFile(fpath_data2x10b, data2x10b);
3114 
3115     string[][] data2x10bExpectedPermuteWt2Probs =
3116         [["random_value", "line", "weight"],
3117          ["0.99996486739067969", "8", "841"],
3118          ["0.99991017467137211", "4", "448"],
3119          ["0.99960871524873662", "6", "711"],
3120          ["0.99914188537143800", "5", "750"],
3121          ["0.99903963250274785", "10", "784"],
3122          ["0.99889631825931946", "7", "867"],
3123          ["0.99852058315191139", "9", "963"],
3124          ["0.99575669679158918", "2", "432"],
3125          ["0.99408758732050595", "1", "761"],
3126          ["0.99315467761212362", "3", "103"]];
3127 
3128     /* 2x10c - Logarithmic distribution in random order. */
3129     string[][] data2x10c =
3130         [["line", "weight"],
3131          ["1", "31.85"],
3132          ["2", "17403.31"],
3133          ["3", "653.84"],
3134          ["4", "8.23"],
3135          ["5", "2671.04"],
3136          ["6", "26226.08"],
3137          ["7", "1.79"],
3138          ["8", "354.56"],
3139          ["9", "35213.81"],
3140          ["10", "679.29"]];
3141 
3142     string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv");
3143     writeUnittestTsvFile(fpath_data2x10c, data2x10c);
3144 
3145     string[][] data2x10cExpectedPermuteWt2Probs =
3146         [["random_value", "line", "weight"],
3147          ["0.99998939008709697", "6", "26226.08"],
3148          ["0.99995951291695517", "9", "35213.81"],
3149          ["0.99991666907613541", "8", "354.56"],
3150          ["0.99989445052186410", "2", "17403.31"],
3151          ["0.99975897602861630", "5", "2671.04"],
3152          ["0.99891852769877643", "3", "653.84"],
3153          ["0.99889167752782515", "10", "679.29"],
3154          ["0.99512207506850148", "4", "8.23"],
3155          ["0.86789371584259023", "1", "31.85"],
3156          ["0.58574438162915610", "7", "1.79"]];
3157 
    /* 2x10d - Logarithmic distribution in ascending order. */
3159     string[][] data2x10d =
3160         [["line", "weight"],
3161          ["1", "1.79"],
3162          ["2", "8.23"],
3163          ["3", "31.85"],
3164          ["4", "354.56"],
3165          ["5", "653.84"],
3166          ["6", "679.29"],
3167          ["7", "2671.04"],
3168          ["8", "17403.31"],
3169          ["9", "26226.08"],
3170          ["10", "35213.81"]];
3171 
3172     string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv");
3173     writeUnittestTsvFile(fpath_data2x10d, data2x10d);
3174 
3175     string[][] data2x10dExpectedPermuteWt2Probs =
3176         [["random_value", "line", "weight"],
3177          ["0.99999830221846353", "8", "17403.31"],
3178          ["0.99997860834041397", "10", "35213.81"],
3179          ["0.99994563828986716", "9", "26226.08"],
3180          ["0.99988650363575737", "4", "354.56"],
3181          ["0.99964161939190088", "7", "2671.04"],
3182          ["0.99959045338948649", "6", "679.29"],
3183          ["0.99901574490639788", "5", "653.84"],
3184          ["0.97803163304747431", "3", "31.85"],
3185          ["0.79994791806910948", "2", "8.23"],
3186          ["0.080374261239949119", "1", "1.79"]];
3187 
    /* 2x10e - Logarithmic distribution in descending order. */
3189     string[][] data2x10e =
3190         [["line", "weight"],
3191          ["1", "35213.81"],
3192          ["2", "26226.08"],
3193          ["3", "17403.31"],
3194          ["4", "2671.04"],
3195          ["5", "679.29"],
3196          ["6", "653.84"],
3197          ["7", "354.56"],
3198          ["8", "31.85"],
3199          ["9", "8.23"],
3200          ["10", "1.79"]];
3201     string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
3202     writeUnittestTsvFile(fpath_data2x10e, data2x10e);
3203 
3204     string[][] data2x10eExpectedPermuteWt2Probs =
3205         [["random_value", "line", "weight"],
3206          ["0.99998493348975237", "4", "2671.04"],
3207          ["0.99995934807202624", "3", "17403.31"],
3208          ["0.99992995739727453", "2", "26226.08"],
3209          ["0.99987185679245649", "1", "35213.81"],
3210          ["0.99957451563173938", "6", "653.84"],
3211          ["0.99907273650209583", "8", "31.85"],
3212          ["0.99905260312968946", "5", "679.29"],
3213          ["0.99730333650516401", "7", "354.56"],
3214          ["0.84093902435227808", "9", "8.23"],
3215          ["0.65650015926290028", "10", "1.79"]];
3216 
3217     /* Data sets for distinct sampling. */
3218     string[][] data5x25 =
3219         [["ID", "Shape", "Color", "Size", "Weight"],
3220          ["01", "circle", "red", "S", "10"],
3221          ["02", "circle", "black", "L", "20"],
3222          ["03", "square", "black", "L", "20"],
3223          ["04", "circle", "green", "L", "30"],
3224          ["05", "ellipse", "red", "S", "20"],
3225          ["06", "triangle", "red", "S", "10"],
3226          ["07", "triangle", "red", "L", "20"],
3227          ["08", "square", "black", "S", "10"],
3228          ["09", "circle", "black", "S", "20"],
3229          ["10", "square", "green", "L", "20"],
3230          ["11", "triangle", "red", "L", "20"],
3231          ["12", "circle", "green", "L", "30"],
3232          ["13", "ellipse", "red", "S", "20"],
3233          ["14", "circle", "green", "L", "30"],
3234          ["15", "ellipse", "red", "L", "30"],
3235          ["16", "square", "red", "S", "10"],
3236          ["17", "circle", "black", "L", "20"],
3237          ["18", "square", "red", "S", "20"],
3238          ["19", "square", "black", "L", "20"],
3239          ["20", "circle", "red", "S", "10"],
3240          ["21", "ellipse", "black", "L", "30"],
3241          ["22", "triangle", "red", "L", "30"],
3242          ["23", "circle", "green", "S", "20"],
3243          ["24", "square", "green", "L", "20"],
3244          ["25", "circle", "red", "S", "10"],
3245         ];
3246 
3247     string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
3248     string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
3249     writeUnittestTsvFile(fpath_data5x25, data5x25);
3250     writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1 .. $]);
3251 
3252     string[][] data5x25ExpectedDistinctK2P40 =
3253         [["ID", "Shape", "Color", "Size", "Weight"],
3254          ["03", "square", "black", "L", "20"],
3255          ["05", "ellipse", "red", "S", "20"],
3256          ["08", "square", "black", "S", "10"],
3257          ["10", "square", "green", "L", "20"],
3258          ["13", "ellipse", "red", "S", "20"],
3259          ["15", "ellipse", "red", "L", "30"],
3260          ["16", "square", "red", "S", "10"],
3261          ["18", "square", "red", "S", "20"],
3262          ["19", "square", "black", "L", "20"],
3263          ["21", "ellipse", "black", "L", "30"],
3264          ["24", "square", "green", "L", "20"],
3265         ];
3266 
3267     string[][] data5x25ExpectedDistinctK2K4P20 =
3268         [["ID", "Shape", "Color", "Size", "Weight"],
3269          ["03", "square", "black", "L", "20"],
3270          ["07", "triangle", "red", "L", "20"],
3271          ["08", "square", "black", "S", "10"],
3272          ["10", "square", "green", "L", "20"],
3273          ["11", "triangle", "red", "L", "20"],
3274          ["16", "square", "red", "S", "10"],
3275          ["18", "square", "red", "S", "20"],
3276          ["19", "square", "black", "L", "20"],
3277          ["22", "triangle", "red", "L", "30"],
3278          ["24", "square", "green", "L", "20"],
3279         ];
3280 
3281     string[][] data5x25ExpectedDistinctK2K3K4P20 =
3282         [["ID", "Shape", "Color", "Size", "Weight"],
3283          ["04", "circle", "green", "L", "30"],
3284          ["07", "triangle", "red", "L", "20"],
3285          ["09", "circle", "black", "S", "20"],
3286          ["11", "triangle", "red", "L", "20"],
3287          ["12", "circle", "green", "L", "30"],
3288          ["14", "circle", "green", "L", "30"],
3289          ["16", "square", "red", "S", "10"],
3290          ["18", "square", "red", "S", "20"],
3291          ["22", "triangle", "red", "L", "30"],
3292         ];
3293 
    /* Fields 2 and 4 from data5x25. Distinct sampling should select the same rows when
     * given equivalent keys.
     */
3295     string[][] data2x25 =
3296         [["Shape", "Size"],
3297          ["circle", "S"],
3298          ["circle", "L"],
3299          ["square", "L"],
3300          ["circle", "L"],
3301          ["ellipse", "S"],
3302          ["triangle", "S"],
3303          ["triangle", "L"],
3304          ["square", "S"],
3305          ["circle", "S"],
3306          ["square", "L"],
3307          ["triangle", "L"],
3308          ["circle", "L"],
3309          ["ellipse", "S"],
3310          ["circle", "L"],
3311          ["ellipse", "L"],
3312          ["square", "S"],
3313          ["circle", "L"],
3314          ["square", "S"],
3315          ["square", "L"],
3316          ["circle", "S"],
3317          ["ellipse", "L"],
3318          ["triangle", "L"],
3319          ["circle", "S"],
3320          ["square", "L"],
3321          ["circle", "S"],
3322         ];
3323 
3324     string fpath_data2x25 = buildPath(testDir, "data2x25.tsv");
3325     string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv");
3326     writeUnittestTsvFile(fpath_data2x25, data2x25);
3327     writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1 .. $]);
3328 
3329     string[][] data2x25ExpectedDistinctK1K2P20 =
3330         [["Shape", "Size"],
3331          ["square", "L"],
3332          ["triangle", "L"],
3333          ["square", "S"],
3334          ["square", "L"],
3335          ["triangle", "L"],
3336          ["square", "S"],
3337          ["square", "S"],
3338          ["square", "L"],
3339          ["triangle", "L"],
3340          ["square", "L"],
3341         ];
3342 
3343     string[][] data1x25 =
3344         [["Shape-Size"],
3345          ["circle-S"],
3346          ["circle-L"],
3347          ["square-L"],
3348          ["circle-L"],
3349          ["ellipse-S"],
3350          ["triangle-S"],
3351          ["triangle-L"],
3352          ["square-S"],
3353          ["circle-S"],
3354          ["square-L"],
3355          ["triangle-L"],
3356          ["circle-L"],
3357          ["ellipse-S"],
3358          ["circle-L"],
3359          ["ellipse-L"],
3360          ["square-S"],
3361          ["circle-L"],
3362          ["square-S"],
3363          ["square-L"],
3364          ["circle-S"],
3365          ["ellipse-L"],
3366          ["triangle-L"],
3367          ["circle-S"],
3368          ["square-L"],
3369          ["circle-S"],
3370         ];
3371 
3372     string fpath_data1x25 = buildPath(testDir, "data1x25.tsv");
3373     string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv");
3374     writeUnittestTsvFile(fpath_data1x25, data1x25);
3375     writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1 .. $]);
3376 
3377     string[][] data1x25ExpectedDistinctK1P20 =
3378         [["Shape-Size"],
3379          ["triangle-L"],
3380          ["square-S"],
3381          ["triangle-L"],
3382          ["ellipse-L"],
3383          ["square-S"],
3384          ["square-S"],
3385          ["ellipse-L"],
3386          ["triangle-L"],
3387         ];
3388 
3389     string[][] data1x25ExpectedDistinctK1P20Probs =
3390         [["random_value", "Shape-Size"],
3391          ["0", "triangle-L"],
3392          ["0", "square-S"],
3393          ["0", "triangle-L"],
3394          ["0", "ellipse-L"],
3395          ["0", "square-S"],
3396          ["0", "square-S"],
3397          ["0", "ellipse-L"],
3398          ["0", "triangle-L"],
3399         ];
3400 
3401     string[][] data1x25ExpectedDistinctK1P20ProbsInorder =
3402         [["random_value", "Shape-Size"],
3403          ["1", "circle-S"],
3404          ["4", "circle-L"],
3405          ["2", "square-L"],
3406          ["4", "circle-L"],
3407          ["2", "ellipse-S"],
3408          ["1", "triangle-S"],
3409          ["0", "triangle-L"],
3410          ["0", "square-S"],
3411          ["1", "circle-S"],
3412          ["2", "square-L"],
3413          ["0", "triangle-L"],
3414          ["4", "circle-L"],
3415          ["2", "ellipse-S"],
3416          ["4", "circle-L"],
3417          ["0", "ellipse-L"],
3418          ["0", "square-S"],
3419          ["4", "circle-L"],
3420          ["0", "square-S"],
3421          ["2", "square-L"],
3422          ["1", "circle-S"],
3423          ["0", "ellipse-L"],
3424          ["0", "triangle-L"],
3425          ["1", "circle-S"],
3426          ["2", "square-L"],
3427          ["1", "circle-S"],
3428         ];
3429 
3430     /*
3431      * Enough setup! Actually run some tests!
3432      */
3433 
3434     /* Shuffling tests. Headers, static seed, compatibility mode. With weights and without. */
3435     testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty);
3436     testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0);
3437     testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1);
3438     testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat);
3439     testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat);
3440     testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat);
3441     testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
3442     testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
3443     testTsvSample(["test-a8b", "-H", "-s", "--weight-field", "field_c", fpath_data3x6], data3x6ExpectedPermuteWt3);
3444     testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3445     testTsvSample(["test-a9b", "-H", "-s", "--print-random", "-w", "field_c", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3446     testTsvSample(["test-a9c", "-H", "-s", "--print-random", "-w", "f*c", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3447     testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3448     testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3449     testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
3450     testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
3451     testTsvSample(["test-a13b", "-H", "-v", "41", "-w", "field_c", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
3452     testTsvSample(["test-a13c", "--line-buffered", "-H", "-v", "41", "-w", "field_c", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
3453 
3454     /* Shuffling, without compatibility mode, or with both compatibility and printing. */
3455     testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
3456     testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0);
3457     testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1);
3458     testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle);
3459     testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap);
3460     testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap);
3461     testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
3462     testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3463     testTsvSample(["test-aa8b", "-H", "-s", "--print-random", "-w", "field_c", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3464     testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3465 
3466     /* Reservoir sampling using Algorithm R.
3467      * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.)
3468      */
3469     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
3470     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
3471     testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0);
3472     testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0);
3473     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1);
3474     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1);
3475     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6);
3476     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6);
3477     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5);
3478     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4);
3479     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3);
3480     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2);
3481     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1);
3482     testTsvSample(["test-aa22b", "--line-buffered", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1);
3483 
3484     /* Inorder versions of Algorithm R tests. */
3485     testTsvSample(["test-ai10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
3486     testTsvSample(["test-ai11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
3487     testTsvSample(["test-ai12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
3488     testTsvSample(["test-ai13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0);
3489     testTsvSample(["test-ai14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1);
3490     testTsvSample(["test-ai15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1);
3491     testTsvSample(["test-ai16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder);
3492     testTsvSample(["test-ai17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder);
3493     testTsvSample(["test-ai18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5Inorder);
3494     testTsvSample(["test-ai19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4Inorder);
3495     testTsvSample(["test-ai20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3Inorder);
3496     testTsvSample(["test-ai21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2Inorder);
3497     testTsvSample(["test-ai22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1Inorder);
3498 
3499     /* Bernoulli sampling cases. */
3500     testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty);
3501     testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0);
3502     testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1);
3503     testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6);
3504     testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6);
3505     testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3506     testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60);
3507     testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60);
3508     testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs);
3509     testTsvSample(["test-a22b", "--line-buffered", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs);
3510 
3511     /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. */
3512     testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01);
3513     testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02);
3514     testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03);
3515     testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01);
3516     testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02);
3517     testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03);
3518     testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40);
3519     testTsvSample(["test-ab7b", "--line-buffered", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40);
3520 
3521     /* Distinct sampling cases. */
3522     testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
3523     testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
3524     testTsvSample(["test-a24b", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "field_a", fpath_data3x0], data3x0);
3525     testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1);
3526     testTsvSample(["test-a25b", "-H", "-s", "-p", "1.0", "-k", "field_b", fpath_data3x1], data3x1);
3527     testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6);
3528     testTsvSample(["test-a26b", "-H", "-s", "-p", "1.0", "-k", "field_b", fpath_data3x6], data3x6);
3529     testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
3530     testTsvSample(["test-a27b", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
3531     testTsvSample(["test-a27c", "--line-buffered", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
3532 
    /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling.
     * For weighted sampling, use the weighted cases, but with the expected output in the original
     * input order. Distinct sampling cases that print random values are included here as well.
     */
3536     testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3537     testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3538     testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
3539                   data3x6ExpectedWt3ProbsInorder);
3540     testTsvSample(["test-a30b", "-H", "-s", "--gen-random-inorder", "--weight-field", "field_c", fpath_data3x6],
3541                   data3x6ExpectedWt3ProbsInorder);
3542     testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
3543                   data3x6ExpectedWt3V41ProbsInorder);
3544     testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6],
3545                   data3x6ExpectedDistinctK1K3P60Probs);
3546     testTsvSample(["test-a32b", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", "--print-random", fpath_data3x6],
3547                   data3x6ExpectedDistinctK1K3P60Probs);
3548     testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header",
3549                    "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom);
3550     testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
3551                   data3x6ExpectedDistinctK2P2ProbsInorder);
3552     testTsvSample(["test-a34b", "--line-buffered", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
3553                   data3x6ExpectedDistinctK2P2ProbsInorder);
3554 
3555     /* Simple random sampling with replacement. */
3556     testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty);
3557     testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty);
3558     testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0);
3559     testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0);
3560     testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3);
3561     testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10);
3562     testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77);
3563     testTsvSample(["test-a41b", "--line-buffered", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77);
3564 
3565     /* Shuffling, compatibility mode, without headers. */
3566     testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1 .. $]);
3567     testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1 .. $]);
3568     testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1 .. $]);
3569     testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1 .. $]);
3570     testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1 .. $]);
3571     testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]);
3572     testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]);
3573     testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]);
3574     testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. $]);
3575     testTsvSample(["test-b9b", "--line-buffered", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. $]);
3576 
3577     /* Shuffling, no headers, without compatibility mode, or with printing and compatibility mode. */
3578     testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1 .. $]);
3579     testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1 .. $]);
3580     testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1 .. $]);
3581     testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1 .. $]);
3582     testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]);
3583     testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]);
3584     testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]);
3585     testTsvSample(["test-bb7b", "--line-buffered", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]);
3586 
3587     /* Reservoir sampling using Algorithm R, no headers. */
3588     testTsvSample(["test-ac10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
3589     testTsvSample(["test-ac11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
3590     testTsvSample(["test-ac14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1 .. $]);
3591     testTsvSample(["test-ac15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1 .. $]);
3592     testTsvSample(["test-ac16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]);
3593     testTsvSample(["test-ac17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]);
3594     testTsvSample(["test-ac18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5[1 .. $]);
3595     testTsvSample(["test-ac19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4[1 .. $]);
3596     testTsvSample(["test-ac20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3[1 .. $]);
3597     testTsvSample(["test-ac21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2[1 .. $]);
3598     testTsvSample(["test-ac22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]);
3599     testTsvSample(["test-ac22b", "--line-buffered", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]);
3600 
3601     /* Reservoir sampling using Algorithm R, no headers, inorder output. */
3602     testTsvSample(["test-aj10", "--prefer-algorithm-r", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
3603     testTsvSample(["test-aj11", "--prefer-algorithm-r", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
3604     testTsvSample(["test-aj14", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3605     testTsvSample(["test-aj15", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3606     testTsvSample(["test-aj16", "--prefer-algorithm-r", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]);
3607     testTsvSample(["test-aj17", "--prefer-algorithm-r", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]);
3608     testTsvSample(["test-aj18", "--prefer-algorithm-r", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5Inorder[1 .. $]);
3609     testTsvSample(["test-aj19", "--prefer-algorithm-r", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4Inorder[1 .. $]);
3610     testTsvSample(["test-aj20", "--prefer-algorithm-r", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3Inorder[1 .. $]);
3611     testTsvSample(["test-aj21", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2Inorder[1 .. $]);
3612     testTsvSample(["test-aj22", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]);
3613     testTsvSample(["test-aj22b", "--line-buffered", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]);
3614 
    /* Bernoulli sampling cases, no headers. */
3616     testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1 .. $]);
3617     testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]);
3618     testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]);
3619     testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
3620     testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1 .. $]);
3621     testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. $]);
3622     testTsvSample(["test-b15b", "--line-buffered", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. $]);
3623 
    /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling, no headers. */
3625     testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1 .. $]);
3626     testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1 .. $]);
3627     testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1 .. $]);
3628     testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1 .. $]);
3629     testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1 .. $]);
3630     testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1 .. $]);
3631     testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. $]);
3632     testTsvSample(["test-bb7b", "--line-buffered", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. $]);
3633 
    /* Distinct sampling cases, no headers. */
3635     testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1 .. $]);
3636     testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3637     testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3638     testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3639     testTsvSample(["test-b19b", "--line-buffered", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3640 
    /* Generating random weights, no headers. Reuse Bernoulli sampling expected results at prob 100% for the uniform case. */
3642     testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
3643     testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1 .. $]);
3644     testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
3645                   data3x6ExpectedDistinctK1K3P60Probs[1 .. $]);
3646     testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
3647                   data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]);
3648     testTsvSample(["test-b24b", "--line-buffered", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
3649                   data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]);
3650 
    /* Simple random sampling with replacement, no headers. */
3652     testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
3653     testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
3654     testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1 .. $]);
3655     testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1 .. $]);
3656     testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]);
3657     testTsvSample(["test-b29b", "--line-buffered", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]);
3658 
3659     /* Multi-file tests. */
3660     testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode",
3661                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3662                   combo1ExpectedPermuteCompat);
3663     testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
3664                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3665                   combo1ExpectedPermuteCompatProbs);
3666     testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
3667                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3668                   combo1ExpectedPermuteWt3Probs);
3669     testTsvSample(["test-c3b", "--header", "--static-seed", "--print-random", "--weight-field", "field_c",
3670                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3671                   combo1ExpectedPermuteWt3Probs);
3672     testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode",
3673                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3674                   combo1ExpectedPermuteWt3);
3675     testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4",
3676                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3677                   combo1ExpectedSampleAlgoRNum4);
3678     testTsvSample(["test-c5b", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder",
3679                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3680                   combo1ExpectedSampleAlgoRNum4Inorder);
3681 
3682     /* Multi-file, no headers. */
3683     testTsvSample(["test-c6", "--static-seed", "--compatibility-mode",
3684                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3685                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3686                   combo1ExpectedPermuteCompat[1 .. $]);
3687     testTsvSample(["test-c7", "--static-seed", "--print-random",
3688                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3689                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3690                   combo1ExpectedPermuteCompatProbs[1 .. $]);
3691     testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3",
3692                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3693                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3694                   combo1ExpectedPermuteWt3Probs[1 .. $]);
3695     testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode",
3696                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3697                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3698                   combo1ExpectedPermuteWt3[1 .. $]);
3699     testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4",
3700                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3701                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3702                   combo1ExpectedSampleAlgoRNum4[1 .. $]);
3703     testTsvSample(["test-c10b", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder",
3704                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3705                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3706                   combo1ExpectedSampleAlgoRNum4Inorder[1 .. $]);
3707 
    /* Multi-file Bernoulli sampling cases, with and without headers. */
3709     testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5",
3710                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3711                   combo1ExpectedBernoulliCompatP50Probs);
3712     testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4",
3713                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3714                   combo1ExpectedBernoulliCompatP40);
3715     testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5",
3716                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3717                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3718                   combo1ExpectedBernoulliCompatP50Probs[1 .. $]);
3719     testTsvSample(["test-c14", "--static-seed", "--prob", ".4",
3720                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3721                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3722                   combo1ExpectedBernoulliCompatP40[1 .. $]);
3723     testTsvSample(["test-c14b", "--line-buffered", "--static-seed", "--prob", ".4",
3724                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3725                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3726                   combo1ExpectedBernoulliCompatP40[1 .. $]);
3727 
    /* Multi-file Bernoulli sampling with probabilities in skip sampling range, with and without headers. */
3729     testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03",
3730                    fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10],
3731                   combo2ExpectedBernoulliSkipV333P03);
3732     testTsvSample(["test-cc2", "-v", "333", "-p", "0.03",
3733                    fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader],
3734                   combo2ExpectedBernoulliSkipV333P03[1 .. $]);
3735     testTsvSample(["test-cc3", "--line-buffered", "-v", "333", "-p", "0.03",
3736                    fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader],
3737                   combo2ExpectedBernoulliSkipV333P03[1 .. $]);
3738 
    /* Multi-file distinct sampling cases, with and without headers. */
3740     testTsvSample(["test-c15", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
3741                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3742                   combo1ExpectedDistinctK1P40);
3743     testTsvSample(["test-c15b", "--header", "--static-seed", "--key-fields", "field_a", "--prob", ".4",
3744                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3745                   combo1ExpectedDistinctK1P40);
3746     testTsvSample(["test-c15c", "--line-buffered", "--header", "--static-seed", "--key-fields", "field_a", "--prob", ".4",
3747                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3748                   combo1ExpectedDistinctK1P40);
3749     testTsvSample(["test-c16", "--static-seed", "--key-fields", "1", "--prob", ".4",
3750                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3751                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3752                   combo1ExpectedDistinctK1P40[1 .. $]);
3753     testTsvSample(["test-c16b", "--line-buffered", "--static-seed", "--key-fields", "1", "--prob", ".4",
3754                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3755                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3756                   combo1ExpectedDistinctK1P40[1 .. $]);
3757 
    /* Multi-file generation of random weights, with and without headers. */
3759     testTsvSample(["test-c17", "--header", "--static-seed", "--gen-random-inorder",
3760                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3761                   combo1ExpectedProbsInorder);
3762     testTsvSample(["test-c18", "--static-seed", "--gen-random-inorder",
3763                    fpath_data3x3_noheader, fpath_data3x1_noheader,
3764                    fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
3765                   combo1ExpectedProbsInorder[1 .. $]);
3766 
    /* Multi-file simple random sampling with replacement, with and without headers. */
3768     testTsvSample(["test-c19", "--header", "--static-seed", "--replace", "--num", "10",
3769                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3770                   combo1ExpectedReplaceNum10);
3771 
3772     testTsvSample(["test-c20", "--static-seed", "--replace", "--num", "10",
3773                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3774                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3775                   combo1ExpectedReplaceNum10[1 .. $]);
3776 
3777     /* Single column file. */
3778     testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
3779     testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
3780 
3781     /* Distributions. */
3782     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
3783     testTsvSample(["test-e1b", "-H", "-s", "-w", "weight", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
3784     testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs);
3785     testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs);
3786     testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs);
3787     testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs);
3788 
    /* Tests of the subset sample size option (-n|--num). Random sampling, Bernoulli sampling, distinct sampling.
     *
     * Note: The expected results are prefixes of the corresponding full-output expected
     * results, so these tests also verify that the subset length does not affect
     * output order.
     */
3794     import std.algorithm : min;
3795     for (size_t n = data3x6.length + 2; n >= 1; n--)
3796     {
3797         /* reservoirSamplingViaHeap.
3798          */
3799         size_t expectedLength = min(data3x6.length, n + 1);
3800         testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
3801                        "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
3802 
3803         testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
3804                        "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
3805 
3806         testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
3807                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]);
3808 
3809         testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
3810                        "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]);
3811 
3812         testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
3813                        "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]);
3814 
3815         testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
3816                        fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]);
3817 
3818         testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
3819                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]);
3820 
3821         testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
3822                        "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]);
3823 
3824         testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string,
3825                        "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]);
3826 
3827         /* Bernoulli sampling.
3828          */
3829         import std.algorithm : min;
3830         size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length);
3831 
3832         testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3833                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]);
3834 
3835         testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3836                        "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]);
3837 
3838         testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3839                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]);
3840 
3841         testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3842                        fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]);
3843 
3844         /* Distinct Sampling.
3845          */
3846         size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length);
3847 
3848         testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
3849                        "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]);
3850 
3851         testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
3852                        fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]);
3853 
3854         testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
3855                        "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]);
3856 
3857         testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
3858                        fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
3859     }
3860 
3861     /* Similar tests with the 1x10 data set. */
3862     for (size_t n = data1x10.length + 2; n >= 1; n--)
3863     {
3864         size_t expectedLength = min(data1x10.length, n + 1);
3865         testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
3866                        "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);
3867 
3868         testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
3869                        "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);
3870 
3871         testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
3872                        fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);
3873 
3874         testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
3875                        "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
3876     }
3877 
3878     /* Simple random sampling with replacement: ensure sample size doesn't change order. */
3879     for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
3880     {
3881         testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
3882                       data3x6ExpectedReplaceNum10[0 .. n + 1]);
3883 
3884         testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
3885                       data3x6ExpectedReplaceNum10[1 .. n + 1]);
3886     }
3887 
3888     /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */
3889     for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
3890     {
3891         size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);
3892 
3893         testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
3894                        "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);
3895 
3896         testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
3897                        fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
3898     }
3899 
3900     /* Inorder sampling tests using reservoir sampling via heap (compatibility mode). */
3901     testTsvSample(["test-ar10", "--compatibility-mode", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
3902     testTsvSample(["test-ar11", "--compatibility-mode", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
3903     testTsvSample(["test-ar12", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
3904     testTsvSample(["test-ar13", "--compatibility-mode", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0);
3905     testTsvSample(["test-ar14", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1);
3906     testTsvSample(["test-ar15", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1);
3907     testTsvSample(["test-ar16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
3908     testTsvSample(["test-ar17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
3909     testTsvSample(["test-ar18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum5Inorder);
3910     testTsvSample(["test-ar19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", fpath_data3x6],         data3x6ExpectedSampleCompatNum4Inorder);
3911     testTsvSample(["test-ar20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum3Inorder);
3912     testTsvSample(["test-ar21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum2Inorder);
3913     testTsvSample(["test-ar22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum1Inorder);
3914 
3915     testTsvSample(["test-as10", "--compatibility-mode", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
3916     testTsvSample(["test-as11", "--compatibility-mode", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
3917     testTsvSample(["test-as14", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3918     testTsvSample(["test-as15", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3919     testTsvSample(["test-as16", "--compatibility-mode", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
3920     testTsvSample(["test-as17", "--compatibility-mode", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
3921     testTsvSample(["test-as18", "--compatibility-mode", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5Inorder[1 .. $]);
3922     testTsvSample(["test-as19", "--compatibility-mode", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4Inorder[1 .. $]);
3923     testTsvSample(["test-as20", "--compatibility-mode", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3Inorder[1 .. $]);
3924     testTsvSample(["test-as21", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2Inorder[1 .. $]);
3925     testTsvSample(["test-as22", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1Inorder[1 .. $]);
3926 
3927     /* Inorder sampling tests with random number printing. --compatibility-mode is not needed for these; some cases are repeated without it. */
3928     testTsvSample(["test-at16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
3929     testTsvSample(["test-at17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
3930     testTsvSample(["test-at18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum5ProbsInorder);
3931     testTsvSample(["test-at19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
3932     testTsvSample(["test-at19",                         "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
3933     testTsvSample(["test-at20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
3934     testTsvSample(["test-at20",                         "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
3935     testTsvSample(["test-at21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum2ProbsInorder);
3936     testTsvSample(["test-at22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum1ProbsInorder);
3937 
3938     testTsvSample(["test-au16", "--compatibility-mode", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
3939     testTsvSample(["test-au17", "--compatibility-mode", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
3940     testTsvSample(["test-au18", "--compatibility-mode", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5ProbsInorder[1 .. $]);
3941     testTsvSample(["test-au19", "--compatibility-mode", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
3942     testTsvSample(["test-au19",                         "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
3943     testTsvSample(["test-au20", "--compatibility-mode", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3ProbsInorder[1 .. $]);
3944     testTsvSample(["test-au21", "--compatibility-mode", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2ProbsInorder[1 .. $]);
3945     testTsvSample(["test-au22", "--compatibility-mode", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1ProbsInorder[1 .. $]);
3946 
3947     /* Inorder weighted sampling tests. */
3948     testTsvSample(["test-ax16", "-H", "-s", "-n", "7", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
3949     testTsvSample(["test-ax17", "-H", "-s", "-n", "6", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
3950     testTsvSample(["test-ax18", "-H", "-s", "-n", "5", "-i", fpath_data3x6], data3x6ExpectedWt3Num5Inorder);
3951     testTsvSample(["test-ax19", "-H", "-s", "-n", "4", "-i", fpath_data3x6], data3x6ExpectedWt3Num4Inorder);
3952     testTsvSample(["test-ax20", "-H", "-s", "-n", "3", "-i", fpath_data3x6], data3x6ExpectedWt3Num3Inorder);
3953     testTsvSample(["test-ax21", "-H", "-s", "-n", "2", "-i", fpath_data3x6], data3x6ExpectedWt3Num2Inorder);
3954     testTsvSample(["test-ax22", "-H", "-s", "-n", "1", "-i", fpath_data3x6], data3x6ExpectedWt3Num1Inorder);
3955 
3956     testTsvSample(["test-ay16", "-s", "-n", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
3957     testTsvSample(["test-ay17", "-s", "-n", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
3958     testTsvSample(["test-ay18", "-s", "-n", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num5Inorder[1 .. $]);
3959     testTsvSample(["test-ay19", "-s", "-n", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num4Inorder[1 .. $]);
3960     testTsvSample(["test-ay20", "-s", "-n", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num3Inorder[1 .. $]);
3961     testTsvSample(["test-ay21", "-s", "-n", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num2Inorder[1 .. $]);
3962     testTsvSample(["test-ay22", "-s", "-n", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num1Inorder[1 .. $]);
3963 
3964     /*
3965      * Distinct sampling tests.
3966      */
3967     testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
3968                   data5x25ExpectedDistinctK2P40);
3969 
3970     testTsvSample(["test-j1b", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "Shape", fpath_data5x25],
3971                   data5x25ExpectedDistinctK2P40);
3972 
3973     testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
3974                   data5x25ExpectedDistinctK2K4P20);
3975 
3976     testTsvSample(["test-j2b", "-H", "-s", "-p", "0.20", "-k", "Shape,Size", fpath_data5x25],
3977                   data5x25ExpectedDistinctK2K4P20);
3978 
3979     testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
3980                   data5x25ExpectedDistinctK2K3K4P20);
3981 
3982     testTsvSample(["test-j3b", "-H", "-s", "-p", "0.20", "-k", "Shape-Size", fpath_data5x25],
3983                   data5x25ExpectedDistinctK2K3K4P20);
3984 
3985     testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
3986                   data5x25ExpectedDistinctK2P40[1 .. $]);
3987 
3988     testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
3989                   data5x25ExpectedDistinctK2K4P20[1 .. $]);
3990 
3991     testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
3992                   data5x25ExpectedDistinctK2K3K4P20[1 .. $]);
3993 
3994 
3995     /* These distinct tests check that using the whole line as the key ('-k 0') and
3996      * specifying all fields in order give the same result. They also check that field
3997      * numbers don't matter: '-k 1,2' in data2x25 selects the same keys as '-k 2,4' in data5x25.
3998      */
3999     testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25],
4000                   data2x25ExpectedDistinctK1K2P20);
4001 
4002     testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25],
4003                   data2x25ExpectedDistinctK1K2P20);
4004 
4005     testTsvSample(["test-j8b", "-H", "-s", "-p", "0.20", "-k", "*", fpath_data2x25],
4006                   data2x25ExpectedDistinctK1K2P20);
4007 
4008     testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader],
4009                   data2x25ExpectedDistinctK1K2P20[1 .. $]);
4010 
4011     testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader],
4012                   data2x25ExpectedDistinctK1K2P20[1 .. $]);
4013 
4014     /* Similar to the last set, but for a 1-column file. Also with random value printing. */
4015     testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25],
4016                   data1x25ExpectedDistinctK1P20);
4017 
4018     testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25],
4019                   data1x25ExpectedDistinctK1P20);
4020 
4021     testTsvSample(["test-j12b", "-H", "-s", "-p", "0.20", "-k", "*", fpath_data1x25],
4022                   data1x25ExpectedDistinctK1P20);
4023 
4024     testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader],
4025                   data1x25ExpectedDistinctK1P20[1 .. $]);
4026 
4027     testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader],
4028                   data1x25ExpectedDistinctK1P20[1 .. $]);
4029 
4030     testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25],
4031                   data1x25ExpectedDistinctK1P20Probs);
4032 
4033     testTsvSample(["test-j15b", "-H", "-s", "-p", "0.20", "-k", `Shape\-Size`, "--print-random", fpath_data1x25],
4034                   data1x25ExpectedDistinctK1P20Probs);
4035 
4036     testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25],
4037                   data1x25ExpectedDistinctK1P20Probs);
4038 
4039     testTsvSample(["test-j16b", "-H", "-s", "-p", "0.20", "-k", "*", "--print-random", fpath_data1x25],
4040                   data1x25ExpectedDistinctK1P20Probs);
4041 
4042     testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader],
4043                   data1x25ExpectedDistinctK1P20Probs[1 .. $]);
4044 
4045     testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader],
4046                   data1x25ExpectedDistinctK1P20Probs[1 .. $]);
4047 
4048     testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25],
4049                   data1x25ExpectedDistinctK1P20ProbsInorder);
4050 
4051     testTsvSample(["test-j19b", "-H", "-s", "-p", "0.20", "-k", `Shape\-Size`, "--gen-random-inorder", fpath_data1x25],
4052                   data1x25ExpectedDistinctK1P20ProbsInorder);
4053 
4054     testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25],
4055                   data1x25ExpectedDistinctK1P20ProbsInorder);
4056 
4057     testTsvSample(["test-j20b", "-H", "-s", "-p", "0.20", "-k", "*", "--gen-random-inorder", fpath_data1x25],
4058                   data1x25ExpectedDistinctK1P20ProbsInorder);
4059 
4060     testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader],
4061                   data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);
4062 
4063     testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader],
4064                   data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);
4065 
4066 }