1 /**
2 Command line tool for shuffling or sampling lines from input streams. Several methods
3 are available, including weighted and unweighted shuffling, simple and weighted random
4 sampling, sampling with replacement, Bernoulli sampling, and distinct sampling.
5 
6 Copyright (c) 2017-2020, eBay Inc.
7 Initially written by Jon Degenhardt
8 
9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 module tsv_utils.tsv_sample;
12 
13 import std.array : appender, Appender, RefAppender;
14 import std.exception : enforce;
15 import std.format : format;
16 import std.range;
17 import std.stdio;
18 import std.typecons : tuple, Flag;
19 
20 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
21 
22 version(unittest)
23 {
24     // When running unit tests, use main from -main compiler switch.
25 }
26 else
27 {
28     /** Main program.
29      *
30      * Invokes command line argument processing and calls tsvSample to do the real
31      * work. Errors occurring during processing are caught and reported to the user.
32      */
33     int main(string[] cmdArgs)
34     {
35         /* When running in DMD code coverage mode, turn on report merging. */
36         version(D_Coverage) version(DigitalMars)
37         {
38             import core.runtime : dmd_coverSetMerge;
39             dmd_coverSetMerge(true);
40         }
41 
42         TsvSampleOptions cmdopt;
43         const r = cmdopt.processArgs(cmdArgs);
44         if (!r[0]) return r[1];
45         version(LDC_Profile)
46         {
47             import ldc.profile : resetAll;
48             resetAll();
49         }
50         try
51         {
52             import tsv_utils.common.utils : BufferedOutputRange;
53             auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
54 
55             tsvSample(cmdopt, bufferedOutput);
56         }
57         catch (Exception exc)
58         {
59             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
60             return 1;
61         }
62         return 0;
63     }
64 }
65 
66 immutable helpText = q"EOS
67 Synopsis: tsv-sample [options] [file...]
68 
69 Sample input lines or randomize their order. Several modes of operation
70 are available:
71 * Shuffling (the default): All input lines are output in random order. All
72   orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines is selected and
74   written to standard output. By default, selected lines are written in
75   random order. All sample sets and orderings are equally likely. Use
76   --i|inorder to write the selected lines in the original input order.
77 * Weighted random sampling (--n|num N, --w|weight-field F): A weighted
78   sample of N lines is produced. Weights are taken from field F. Lines are
79   output in weighted selection order. Use --i|inorder to write in original
80   input order. Omit --n|num to shuffle all lines (weighted shuffling).
81 * Sampling with replacement (--r|replace, --n|num N): All input lines are
82   read in, then lines are repeatedly selected at random and written out.
83   This continues until N lines are output. Individual lines can be written
84   multiple times. Output continues forever if N is zero or not provided.
85 * Bernoulli sampling (--p|prob P): A random subset of lines is selected
86   based on probability P, a 0.0-1.0 value. This is a streaming operation.
87   A decision is made on each line as it is read. Line order is not changed.
88 * Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys is chosen based
90   on the inclusion probability (a 'distinct' set of keys). All lines with
91   one of the selected keys are output. Line order is not changed.
92 
93 Fields are specified using field number or field name. Field names require
94 that the input file has a header line.
95 
96 Use '--help-verbose' for detailed information.
97 
98 Options:
99 EOS";
100 
101 immutable helpTextVerbose = q"EOS
102 Synopsis: tsv-sample [options] [file...]
103 
104 Sample input lines or randomize their order. Several modes of operation
105 are available:
106 * Shuffling (the default): All input lines are output in random order. All
107   orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines is selected and
109   written to standard output. By default, selected lines are written in
110   random order. All sample sets and orderings are equally likely. Use
111   --i|inorder to write the selected lines in the original input order.
112 * Weighted random sampling (--n|num N, --w|weight-field F): A weighted
113   sample of N lines is produced. Weights are taken from field F. Lines are
114   output in weighted selection order. Use --i|inorder to write in original
115   input order. Omit --n|num to shuffle all lines (weighted shuffling).
116 * Sampling with replacement (--r|replace, --n|num N): All input lines are
117   read in, then lines are repeatedly selected at random and written out.
118   This continues until N lines are output. Individual lines can be written
119   multiple times. Output continues forever if N is zero or not provided.
120 * Bernoulli sampling (--p|prob P): A random subset of lines is selected
121   based on probability P, a 0.0-1.0 value. This is a streaming operation.
122   A decision is made on each line as it is read. Line order is not changed.
123 * Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys is chosen based
125   on the inclusion probability (a 'distinct' set of keys). All lines with
126   one of the selected keys are output. Line order is not changed.
127 
128 Fields: Fields are specified by field number or name. Field names require
129 the input file to have a header line. Use '--help-fields' for details.
130 
131 Sample size: The '--n|num' option controls the sample size for all
132 sampling methods. In the case of simple and weighted random sampling it
133 also limits the amount of memory required.
134 
135 Controlling the random seed: By default, each run produces a different
136 randomization or sampling. Using '--s|static-seed' changes this so
137 multiple runs produce the same results. This works by using the same
138 random seed each run. The random seed can be specified using
139 '--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
140 value is a no-op and ignored.)
141 
142 Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read; there is no memory accumulation. These algorithms
144 can run on arbitrary size inputs. Sampling with replacement reads all
145 lines into memory and is limited by available memory. Shuffling also reads
146 all lines into memory and is similarly limited. Random sampling uses
147 reservoir sampling, and only needs to hold the sample size (--n|num) in
148 memory. The input data can be of any length.
149 
150 Weighted sampling: Weighted random sampling is done using an algorithm
151 described by Pavlos Efraimidis and Paul Spirakis. Weights should be
152 positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights; it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and are given the value zero. Input order is not retained;
instead, lines are output ordered by the randomized weight that was
assigned. This means that a smaller valid sample can be produced by
taking the first N lines of output. For more info on the sampling
approach see:
159 * Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
160 * "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
161   (https://arxiv.org/abs/1012.0256)
162 
163 Printing random values: Most of the sampling algorithms work by generating
164 a random value for each line. (See "Compatibility mode" below.) The nature
165 of these values depends on the sampling algorithm. They are used for both
166 line selection and output ordering. The '--p|print-random' option can be
167 used to print these values. The random value is prepended to the line
168 separated by the --d|delimiter char (TAB by default). The
169 '--gen-random-inorder' option takes this one step further, generating
170 random values for all input lines without changing the input order. The
171 types of values currently used by these sampling algorithms:
172 * Unweighted sampling: Uniform random value in the interval [0,1]. This
173   includes Bernoulli sampling and unweighted line order randomization.
174 * Weighted sampling: Value in the interval [0,1]. Distribution depends on
175   the values in the weight field. It is used as a partial ordering.
176 * Distinct sampling: An integer, zero and up, representing a selection
177   group. The inclusion probability determines the number of selection groups.
178 * Sampling with replacement: Random value printing is not supported.
179 
180 The specifics behind these random values are subject to change in future
181 releases.
182 
183 Compatibility mode: As described above, many of the sampling algorithms
184 assign a random value to each line. This is useful when printing random
185 values. It has another occasionally useful property: repeated runs with
186 the same static seed but different selection parameters are more
187 compatible with each other, as each line gets assigned the same random
188 value on every run. For example, if Bernoulli sampling is run with
189 '--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
190 all the lines selected in the first run will be selected in the second.
191 This comes at a cost: in some cases there are faster algorithms that don't
192 preserve this property. By default, tsv-sample will use faster algorithms
193 when available. However, the '--compatibility-mode' option switches to
194 algorithms that assign a random value per line. Printing random values
195 also engages compatibility mode.
196 
197 Options:
198 EOS";
199 
200 /** Container for command line options and derived data.
201  *
202  * TsvSampleOptions handles several aspects of command line options. On the input side,
203  * it defines the command line options available, performs validation, and sets up any
204  * derived state based on the options provided. These activities are handled by the
205  * processArgs() member.
206  *
207  * Once argument processing is complete, TsvSampleOptions is used as a container
208  * holding the specific processing options used by the different sampling routines.
209  */
210 struct TsvSampleOptions
211 {
212     import tsv_utils.common.utils : InputSourceRange;
213 
214     string programName;                        /// Program name
215     InputSourceRange inputSources;             /// Input files
216     bool hasHeader = false;                    /// --H|header
217     ulong sampleSize = 0;                      /// --n|num - Size of the desired sample
218     double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
219     size_t[] keyFields;                        /// Derived: --k|key-fields - Used with inclusion probability
220     size_t weightField = 0;                    /// Derived: --w|weight-field - Field holding the weight
221     bool srsWithReplacement = false;           /// --r|replace
222     bool preserveInputOrder = false;           /// --i|inorder
223     bool staticSeed = false;                   /// --s|static-seed
224     uint seedValueOptionArg = 0;               /// --v|seed-value
225     bool printRandom = false;                  /// --print-random
226     bool genRandomInorder = false;             /// --gen-random-inorder
227     string randomValueHeader = "random_value"; /// --random-value-header
228     bool compatibilityMode = false;            /// --compatibility-mode
229     char delim = '\t';                         /// --d|delimiter
230     bool preferSkipSampling = false;           /// --prefer-skip-sampling
231     bool preferAlgorithmR = false;             /// --prefer-algorithm-r
232     bool hasWeightField = false;               /// Derived.
233     bool useBernoulliSampling = false;         /// Derived.
234     bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
236     bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
237     uint seed = 0;                             /// Derived from --static-seed, --seed-value
238 
239     /** Process tsv-sample command line arguments.
240      *
241      * Defines the command line options, performs validation, and derives additional
242      * state. std.getopt.getopt is called to do the main option processing followed
243      * additional validation and derivation.
244      *
245      * Help text is printed to standard output if help was requested. Error text is
246      * written to stderr if invalid input is encountered.
247      *
     * A tuple is returned. The first value is true if command line arguments were
249      * successfully processed and execution should continue, or false if an error
250      * occurred or the user asked for help. If false, the second value is the
251      * appropriate exit code (0 or 1).
252      *
253      * Returning true (execution continues) means args have been validated and derived
254      * values calculated. Field indices will have been converted to zero-based.
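     *
     * A minimal calling sketch, mirroring the main() routine above (cmdArgs is the
     * program's argument array):
     * ---
     * TsvSampleOptions cmdopt;
     * const r = cmdopt.processArgs(cmdArgs);
     * if (!r[0]) return r[1];    // Help was printed or an error occurred; r[1] is the exit code.
     * ---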
255      */
256     auto processArgs(ref string[] cmdArgs)
257     {
258         import std.algorithm : all, canFind, each;
259         import std.conv : to;
260         import std.getopt;
261         import std.math : isNaN;
262         import std.path : baseName, stripExtension;
263         import std.typecons : Yes, No;
264         import tsv_utils.common.utils : inputSourceRange, ReadHeader, throwIfWindowsNewlineOnUnix;
265         import tsv_utils.common.fieldlist;
266 
267         bool helpVerbose = false;                  // --help-verbose
268         bool helpFields = false;                   // --help-fields
269         bool versionWanted = false;                // --V|version
270         string keyFieldsArg;                       // --k|key-fields
271         string weightFieldArg;                     // --w|weight-field
272 
273         string keyFieldsOptionString = "k|key-fields";
274         string weightFieldOptionString = "w|weight-field";
275 
276         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
277 
278         try
279         {
280             arraySep = ",";    // Use comma to separate values in command line options
281             auto r = getopt(
282                 cmdArgs,
283                 "help-verbose",    "     Print more detailed help.", &helpVerbose,
284                 "help-fields",     "     Print help on specifying fields.", &helpFields,
285 
286                 std.getopt.config.caseSensitive,
287                 "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
288                 std.getopt.config.caseInsensitive,
289 
290                 "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
291                 "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,
292 
293                 keyFieldsOptionString,
294                 "<field-list>  Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
295                 &keyFieldsArg,
296 
297                 weightFieldOptionString,
298                 "NUM  Field containing weights. All lines get equal weight if not provided.",
299                 &weightFieldArg,
300 
301                 "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
302                 "i|inorder",       "     Output random samples in original input order. Requires use of --n|num.", &preserveInputOrder,
303                 "s|static-seed",   "     Use the same random seed every run.", &staticSeed,
304 
305                 std.getopt.config.caseSensitive,
306                 "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
307                 std.getopt.config.caseInsensitive,
308 
309                 "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
310                 "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
311                 "random-value-header",  "     Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
312                 "compatibility-mode", "     Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,
313 
314                 "d|delimiter",     "CHR  Field delimiter.", &delim,
315 
316                 std.getopt.config.caseSensitive,
317                 "V|version",       "     Print version information and exit.", &versionWanted,
318                 std.getopt.config.caseInsensitive,
319 
320                 "prefer-skip-sampling", "     (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
321                 &preferSkipSampling,
322 
323                 "prefer-algorithm-r",   "     (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
324                 &preferAlgorithmR,
325                 );
326 
327             if (r.helpWanted)
328             {
329                 defaultGetoptPrinter(helpText, r.options);
330                 return tuple(false, 0);
331             }
332             else if (helpVerbose)
333             {
334                 defaultGetoptPrinter(helpTextVerbose, r.options);
335                 return tuple(false, 0);
336             }
337             else if (helpFields)
338             {
339                 writeln(fieldListHelpText);
340                 return tuple(false, 0);
341             }
342             else if (versionWanted)
343             {
344                 import tsv_utils.common.tsvutils_version;
345                 writeln(tsvutilsVersionNotice("tsv-sample"));
346                 return tuple(false, 0);
347             }
348 
349             /* Input files. Remaining command line args are files. */
350             string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
351             cmdArgs.length = 1;
352 
353             /* Validation and derivations - Do as much validation prior to header line
354              * processing as possible (avoids waiting on stdin).
355              *
356              * Note: keyFields and weightField depend on header line processing, but
357              * keyFieldsArg and weightFieldArg can be used to detect whether the
358              * command line argument was specified.
359              */
360 
361             /* Set hasWeightField here so it can be used in other validation checks.
362              * Field validity checked after reading file header.
363              */
364             hasWeightField = !weightFieldArg.empty;
365 
366             /* Sampling with replacement checks (--r|replace). */
367             if (srsWithReplacement)
368             {
369                 enforce(!hasWeightField,
370                         "Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");
371 
372                 enforce(inclusionProbability.isNaN,
373                         "Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");
374 
375                 enforce(keyFieldsArg.empty,
376                         "Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");
377 
378                 enforce(!printRandom && !genRandomInorder,
379                         "Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");
380 
381                 enforce(!preserveInputOrder,
382                         "Sampling with replacement (--r|replace) does not support input order preservation (--i|inorder option).");
383             }
384 
385             /* Distinct sampling checks (--k|key-fields --p|prob). */
            enforce(keyFieldsArg.empty || !inclusionProbability.isNaN,
387                     "--p|prob is required when using --k|key-fields.");
388 
389             /* Inclusion probability (--p|prob) is used for both Bernoulli sampling
390              * and distinct sampling.
391              */
392             if (!inclusionProbability.isNaN)
393             {
394                 enforce(inclusionProbability > 0.0 && inclusionProbability <= 1.0,
395                         format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));
396 
397                 if (!keyFieldsArg.empty) useDistinctSampling = true;
398                 else useBernoulliSampling = true;
399 
400                 enforce(!hasWeightField, "--w|weight-field and --p|prob cannot be used together.");
401 
402                 enforce(!genRandomInorder || useDistinctSampling,
403                         "--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used." ~
404                         "\nUse --gen-random-inorder alone to print probabilities for all lines." ~
405                         "\nUse --p|prob and --print-random to print probabilities for lines satisfying the probability threshold.");
406             }
407             else if (genRandomInorder && !hasWeightField)
408             {
409                 useBernoulliSampling = true;
410             }
411 
412             /* randomValueHeader (--random-value-header) validity. Note that
413                randomValueHeader is initialized to a valid, non-empty string.
414             */
415             enforce(!randomValueHeader.empty && !randomValueHeader.canFind('\n') &&
416                     !randomValueHeader.canFind(delim),
417                     "--randomValueHeader must be at least one character and not contain field delimiters or newlines.");
418 
419             /* Check for incompatible use of (--i|inorder) and shuffling of the full
             * data set. Sampling with replacement is also incompatible, but this is
             * detected earlier. Shuffling is the default operation, so it is identified
             * by eliminating the other modes of operation.
423              */
424             enforce(!preserveInputOrder ||
425                     sampleSize != 0 ||
426                     useBernoulliSampling ||
427                     useDistinctSampling,
428                     "Preserving input order (--i|inorder) is not compatible with full data set shuffling. Switch to random sampling with a sample size (--n|num) to use --i|inorder.");
429 
430 
431             /* Compatibility mode checks:
             * - Random value printing implies compatibility-mode; otherwise the
             *   user's selection is used.
434              * - Distinct sampling doesn't support compatibility-mode. The routines
435              *   don't care, but users might expect larger probabilities to be a
436              *   superset of smaller probabilities. This would be confusing, so
437              *   flag it as an error.
438              */
439             enforce(!(compatibilityMode && useDistinctSampling),
440                     "Distinct sampling (--k|key-fields --p|prob) does not support --compatibility-mode.");
441 
442             if (printRandom || genRandomInorder) compatibilityMode = true;
443 
444 
445             /* Seed. */
446             import std.random : unpredictableSeed;
447 
448             usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);
449 
450             if (usingUnpredictableSeed) seed = unpredictableSeed;
451             else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
452             else if (staticSeed) seed = 2438424139;
453             else assert(0, "Internal error, invalid seed option states.");
454 
455             string[] headerFields;
456 
457             /* fieldListArgProcessing encapsulates the field list processing. It is
458              * called prior to reading the header line if headers are not being used,
459              * and after if headers are being used.
460              */
461             void fieldListArgProcessing()
462             {
463                 if (!weightFieldArg.empty)
464                 {
465                     auto fieldIndices =
466                         weightFieldArg
467                         .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero)
468                         (hasHeader, headerFields, weightFieldOptionString)
469                         .array;
470 
471                     enforce(fieldIndices.length == 1,
472                             format("'--%s' must be a single field.", weightFieldOptionString));
473 
474                     weightField = fieldIndices[0];
475                 }
476 
477                 if (!keyFieldsArg.empty)
478                 {
479                     keyFields =
480                         keyFieldsArg
481                         .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
482                         (hasHeader, headerFields, keyFieldsOptionString)
483                         .array;
484 
485                     assert(keyFields.length > 0);
486 
487                     if (keyFields.length > 0)
488                     {
489                         if (keyFields.length == 1 && keyFields[0] == 0)
490                         {
491                             distinctKeyIsFullLine = true;
492                         }
493                         else
494                         {
495                             enforce(keyFields.length <= 1 || keyFields.all!(x => x != 0),
496                                     "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");
497 
498                             keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
499                         }
500                     }
501                 }
502             }
503 
504             if (!hasHeader) fieldListArgProcessing();
505 
506             /*
507              * Create the inputSourceRange and perform header line processing.
508              */
509             ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
510             inputSources = inputSourceRange(filepaths, readHeader);
511 
512             if (hasHeader)
513             {
514                 throwIfWindowsNewlineOnUnix(inputSources.front.header, inputSources.front.name, 1);
515                 headerFields = inputSources.front.header.split(delim).to!(string[]);
516                 fieldListArgProcessing();
517             }
518 
519         }
520         catch (Exception exc)
521         {
522             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
523             return tuple(false, 1);
524         }
525         return tuple(true, 0);
526     }
}

528 /** Invokes the appropriate sampling routine based on the command line arguments.
529  *
530  * tsvSample is the top-level routine handling the different tsv-sample use cases.
 * Its primary role is to invoke the correct routine for the type of sampling requested.
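 *
 * A minimal usage sketch. Any character output range works; the sketch below uses an
 * appender (already imported by this module) to collect the sampled lines in memory:
 * ---
 * TsvSampleOptions cmdopt;
 * const r = cmdopt.processArgs(cmdArgs);
 * if (r[0])
 * {
 *     auto output = appender!(char[])();
 *     tsvSample(cmdopt, output);    // Sampled/shuffled lines accumulate in output.data.
 * }
 * ---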
532  */
533 void tsvSample(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
534 if (isOutputRange!(OutputRange, char))
535 {
536     if (cmdopt.srsWithReplacement)
537     {
538         simpleRandomSamplingWithReplacement(cmdopt, outputStream);
539     }
540     else if (cmdopt.useBernoulliSampling)
541     {
542         bernoulliSamplingCommand(cmdopt, outputStream);
543     }
544     else if (cmdopt.useDistinctSampling)
545     {
546         if (cmdopt.genRandomInorder) distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
547         else distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
548     }
549     else if (cmdopt.genRandomInorder)
550     {
551         /* Note that the preceding cases handle gen-random-inorder themselves (Bernoulli,
552          * Distinct), or don't handle it (SRS w/ Replacement).
553          */
554         assert(cmdopt.hasWeightField);
555         generateWeightedRandomValuesInorder(cmdopt, outputStream);
556     }
557     else if (cmdopt.sampleSize != 0)
558     {
559         randomSamplingCommand(cmdopt, outputStream);
560     }
561     else
562     {
563         shuffleCommand(cmdopt, outputStream);
564     }
565 }
566 
567 /** Bernoulli sampling command handler. Invokes the appropriate Bernoulli sampling
568  * routine based on the command line arguments.
569  *
570  * This routine selects the appropriate Bernoulli sampling function and template
571  * instantiation to use based on the command line arguments.
572  *
573  * One of the basic choices is whether to use the vanilla algorithm or skip sampling.
574  * Skip sampling is a little bit faster when the inclusion probability is small but
575  * doesn't support compatibility mode. See the bernoulliSkipSampling documentation
576  * for a discussion of the skipSamplingProbabilityThreshold used here.
577  */
578 void bernoulliSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
579 if (isOutputRange!(OutputRange, char))
580 {
581     assert(!cmdopt.hasWeightField);
582 
583     immutable double skipSamplingProbabilityThreshold = 0.04;
584 
585     if (cmdopt.compatibilityMode ||
586         (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling))
587     {
588         if (cmdopt.genRandomInorder)
589         {
590             bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
591         }
592         else
593         {
594             bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
595         }
596     }
597     else
598     {
599         bernoulliSkipSampling(cmdopt, outputStream);
600     }
601 }
602 
603 /** Bernoulli sampling of lines from the input stream.
604  *
 * Each input line is assigned a random value and is output if the value is less
 * than cmdopt.inclusionProbability. The order of the lines is not changed.
607  *
608  * This routine supports random value printing and gen-random-inorder value printing.
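 *
 * A minimal sketch of the per-line selection test used in the loop below (the seed
 * and probability values are illustrative):
 * ---
 * import std.random : Mt19937, uniform01;
 *
 * auto rng = Mt19937(2438424139U);                 // The built-in static seed.
 * double inclusionProbability = 0.1;               // --p|prob
 * immutable double lineScore = uniform01(rng);     // Uniform in [0, 1)
 * bool selected = lineScore < inclusionProbability;
 * ---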
609  */
610 void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
611     (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
612 if (isOutputRange!(OutputRange, char))
613 {
614     import std.random : Random = Mt19937, uniform01;
615     import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
616         InputSourceRange, throwIfWindowsNewlineOnUnix;
617 
618     static if (generateRandomAll) assert(cmdopt.genRandomInorder);
619     else assert(!cmdopt.genRandomInorder);
620 
621     assert(!cmdopt.inputSources.empty);
622     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
623 
624     auto randomGenerator = Random(cmdopt.seed);
625 
626     /* First header is read during command line argument processing. */
627     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
628     {
629         auto inputStream = cmdopt.inputSources.front;
630 
631         static if (generateRandomAll)
632         {
633             outputStream.put(cmdopt.randomValueHeader);
634             outputStream.put(cmdopt.delim);
635         }
636         else if (cmdopt.printRandom)
637         {
638             outputStream.put(cmdopt.randomValueHeader);
639             outputStream.put(cmdopt.delim);
640         }
641 
642         outputStream.put(inputStream.header);
643         outputStream.put("\n");
644 
645         /* Immediately flush the header so subsequent processes in a unix command
646          * pipeline see it early. This helps provide timely error messages.
647          */
648         static if (isFlushableOutputRange!OutputRange) outputStream.flush;
649     }
650 
651     /* Process each line. */
652     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
653     ulong numLinesWritten = 0;
654 
655     foreach (inputStream; cmdopt.inputSources)
656     {
657         if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);
658 
659         foreach (ulong fileLineNum, line;
660                  inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
661         {
662             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);
663 
664             immutable double lineScore = uniform01(randomGenerator);
665 
666             static if (generateRandomAll)
667             {
668                 outputStream.formatRandomValue(lineScore);
669                 outputStream.put(cmdopt.delim);
670                 outputStream.put(line);
671                 outputStream.put("\n");
672 
673                 if (cmdopt.sampleSize != 0)
674                 {
675                     ++numLinesWritten;
676                     if (numLinesWritten == cmdopt.sampleSize) return;
677                 }
678             }
679             else if (lineScore < cmdopt.inclusionProbability)
680             {
681                 if (cmdopt.printRandom)
682                 {
683                     outputStream.formatRandomValue(lineScore);
684                     outputStream.put(cmdopt.delim);
685                 }
686                 outputStream.put(line);
687                 outputStream.put("\n");
688 
689                 if (cmdopt.sampleSize != 0)
690                 {
691                     ++numLinesWritten;
692                     if (numLinesWritten == cmdopt.sampleSize) return;
693                 }
694             }
695         }
696     }
697 }
698 
699 /** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips.
700  *
701  * Skip sampling works by skipping a random number of lines between selections. This
702  * can be faster than assigning a random value to each line when the inclusion
703  * probability is low, as it reduces the number of calls to the random number
704  * generator. Both the random number generator and the log() function are called when
705  * calculating the next skip size. These additional log() calls add up as the
706  * inclusion probability increases.
707  *
708  * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
709  * file-oriented line sampling. This is obviously environment specific. In the
 * environments where this implementation has been tested, the performance improvements
711  * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
712  *
713  * The algorithm does not assign random values to individual lines. This makes it
714  * incompatible with random value printing. It is not suitable for compatibility mode
715  * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
716  * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
717  * does not have this property.
718  *
719  * The algorithm for calculating the skip size has been described by multiple sources.
720  * There are two key variants depending on whether the total number of lines in the
721  * data set is known in advance. (This implementation does not know the total.)
722  * Useful references:
723  * $(LIST
724  *     * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
725  *       ACM Trans on Mathematical Software, 1987. On-line:
726  *       http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
727  *     * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
728  *       "Data Stream Management", Springer-Verlag, 2016. On-line:
729  *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
730  *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
731  *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
732  * )
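 *
 * A minimal sketch of the skip-size calculation used below (a draw from the
 * geometric distribution; the seed and probability values are illustrative):
 * ---
 * import std.conv : to;
 * import std.math : log, trunc;
 * import std.random : Mt19937, uniform01;
 *
 * auto rng = Mt19937(2438424139U);
 * double inclusionProbability = 0.01;                      // --p|prob
 * double logDiscardRate = log(1.0 - inclusionProbability);
 * // '1.0 - uniform01' maps [0, 1) to (0, 1], so log() never sees zero.
 * size_t remainingSkips =
 *     (log(1.0 - uniform01(rng)) / logDiscardRate).trunc.to!size_t;
 * ---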
733  */
734 void bernoulliSkipSampling(OutputRange)(ref TsvSampleOptions cmdopt, OutputRange outputStream)
735     if (isOutputRange!(OutputRange, char))
736 {
737     import std.conv : to;
738     import std.math : log, trunc;
739     import std.random : Random = Mt19937, uniform01;
740     import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
741         InputSourceRange, throwIfWindowsNewlineOnUnix;
742 
743     assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
744     assert(!cmdopt.printRandom);
745     assert(!cmdopt.compatibilityMode);
746 
747     assert(!cmdopt.inputSources.empty);
748     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
749 
750     auto randomGenerator = Random(cmdopt.seed);
751 
752     immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
753     immutable double logDiscardRate = log(discardRate);
754 
    /* Note: The '1.0 - uniform01(randomGenerator)' expression flips the half-open
     * interval [0.0, 1.0) to (0.0, 1.0], excluding 0.0 so log() is never passed zero.
757      */
758     size_t remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
759 
760     /* First header is read during command line argument processing. */
761     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
762     {
763         auto inputStream = cmdopt.inputSources.front;
764 
765         outputStream.put(inputStream.header);
766         outputStream.put("\n");
767 
768         /* Immediately flush the header so subsequent processes in a unix command
769          * pipeline see it early. This helps provide timely error messages.
770          */
771         static if (isFlushableOutputRange!OutputRange) outputStream.flush;
772     }
773 
774     /* Process each line. */
775     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
776     ulong numLinesWritten = 0;
777     foreach (inputStream; cmdopt.inputSources)
778     {
779         if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);
780 
781         foreach (ulong fileLineNum, line;
782                  inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
783         {
784             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);
785 
786             if (remainingSkips > 0)
787             {
788                 --remainingSkips;
789             }
790             else
791             {
792                 outputStream.put(line);
793                 outputStream.put("\n");
794 
795                 if (cmdopt.sampleSize != 0)
796                 {
797                     ++numLinesWritten;
798                     if (numLinesWritten == cmdopt.sampleSize) return;
799                 }
800 
801                 remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
802             }
803         }
804     }
805 }
806 
807 /** Sample lines by choosing a random set of distinct keys formed from one or more
808  * fields on each line.
809  *
810  * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling.
811  * However, instead of each line being subject to an independent trial, lines are
812  * selected based on a key from each line. A portion of keys are randomly selected for
813  * output, and every line containing a selected key is included in the output.
814  *
815  * An example use-case is a query log having <user, query, clicked-url> triples. It is
 * often useful to sample records for a portion of the users, while including all
 * records for the selected users. Distinct sampling supports this by selecting a
 * subset of users to include in the output.
819  *
820  * Distinct sampling is done by hashing the key and mapping the hash value into
821  * buckets sized to hold the inclusion probability. Records having a key mapping to
822  * bucket zero are output. Buckets are equal size and therefore may be larger than the
 * inclusion probability. (The other approach would be to have the caller specify
 * the number of buckets. More correct, but less convenient.)
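 *
 * A minimal sketch of the hash-and-bucket test used below (the seed, probability,
 * and key values are illustrative):
 * ---
 * import std.conv : to;
 * import std.digest.murmurhash;
 * import std.math : lrint;
 *
 * double inclusionProbability = 0.1;                              // --p|prob
 * uint numBuckets = (1.0 / inclusionProbability).lrint.to!uint;   // 10 buckets
 * auto hasher = MurmurHash3!32(2438424139U);        // Seeded with the run's random seed.
 * hasher.put(cast(ubyte[]) "user-42".dup);          // The assembled key.
 * hasher.finish;
 * bool selected = (hasher.get % numBuckets == 0);   // Keys hashing to bucket zero are output.
 * ---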
825  */
826 void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
827     (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
828 if (isOutputRange!(OutputRange, char))
829 {
830     import std.algorithm : splitter;
831     import std.conv : to;
832     import std.digest.murmurhash;
833     import std.math : lrint;
834     import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
835         InputFieldReordering, InputSourceRange, throwIfWindowsNewlineOnUnix;
836 
837     static if (generateRandomAll) assert(cmdopt.genRandomInorder);
838     else assert(!cmdopt.genRandomInorder);
839 
840     assert(cmdopt.keyFields.length > 0);
841     assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);
842 
843     assert(!cmdopt.inputSources.empty);
844     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
845 
846     static if (generateRandomAll)
847     {
848         import std.format : formatValue, singleSpec;
849         immutable randomValueFormatSpec = singleSpec("%d");
850     }
851 
852     immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys.
853 
854     uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint;
855 
856     /* Create a mapping for the key fields. */
857     auto keyFieldsReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);
858 
859     /* First header is read during command line argument processing. */
860     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
861     {
862         auto inputStream = cmdopt.inputSources.front;
863 
864         static if (generateRandomAll)
865         {
866             outputStream.put(cmdopt.randomValueHeader);
867             outputStream.put(cmdopt.delim);
868         }
869         else if (cmdopt.printRandom)
870         {
871             outputStream.put(cmdopt.randomValueHeader);
872             outputStream.put(cmdopt.delim);
873         }
874 
875         outputStream.put(inputStream.header);
876         outputStream.put("\n");
877 
878         /* Immediately flush the header so subsequent processes in a unix command
879          * pipeline see it early. This helps provide timely error messages.
880          */
881         static if (isFlushableOutputRange!OutputRange) outputStream.flush;
882     }
883 
884     /* Process each line. */
885     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
886     ulong numLinesWritten = 0;
887 
888     foreach (inputStream; cmdopt.inputSources)
889     {
890         if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);
891 
892         foreach (ulong fileLineNum, line;
893                  inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
894         {
895             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);
896 
897             /* Murmurhash works by successively adding individual keys, then finalizing.
898              * Adding individual keys is simpler if the full-line-as-key and individual
899              * fields as keys cases are separated.
900              */
901             auto hasher = MurmurHash3!32(cmdopt.seed);
902 
903             if (cmdopt.distinctKeyIsFullLine)
904             {
905                 hasher.put(cast(ubyte[]) line);
906             }
907             else
908             {
909                 assert(keyFieldsReordering !is null);
910 
911                 /* Gather the key field values and assemble the key. */
912                 keyFieldsReordering.initNewLine;
913                 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
914                 {
915                     keyFieldsReordering.processNextField(fieldIndex, fieldValue);
916                     if (keyFieldsReordering.allFieldsFilled) break;
917                 }
918 
919                 enforce(keyFieldsReordering.allFieldsFilled,
920                         format("Not enough fields in line. File: %s, Line: %s",
921                                inputStream.name, fileLineNum));
922 
923                 foreach (count, key; keyFieldsReordering.outputFields.enumerate)
924                 {
925                     if (count > 0) hasher.put(delimArray);
926                     hasher.put(cast(ubyte[]) key);
927                 }
928             }
929 
930             hasher.finish;
931 
932             static if (generateRandomAll)
933             {
934                 import std.conv : to;
935                 outputStream.formatValue(hasher.get % numBuckets, randomValueFormatSpec);
936                 outputStream.put(cmdopt.delim);
937                 outputStream.put(line);
938                 outputStream.put("\n");
939 
940                 if (cmdopt.sampleSize != 0)
941                 {
942                     ++numLinesWritten;
943                     if (numLinesWritten == cmdopt.sampleSize) return;
944                 }
945             }
946             else if (hasher.get % numBuckets == 0)
947             {
948                 if (cmdopt.printRandom)
949                 {
950                     outputStream.put('0');
951                     outputStream.put(cmdopt.delim);
952                 }
953                 outputStream.put(line);
954                 outputStream.put("\n");
955 
956                 if (cmdopt.sampleSize != 0)
957                 {
958                     ++numLinesWritten;
959                     if (numLinesWritten == cmdopt.sampleSize) return;
960                 }
961             }
962         }
963     }
964 }
965 
966 /** Random sampling command handler. Invokes the appropriate sampling routine based on
967  * the command line arguments.
968  *
969  * Random sampling selects a fixed size random sample from the input stream. Both
970  * simple random sampling (equal likelihood) and weighted random sampling are
971  * supported. Selected lines are output either in random order or original input order.
972  * For weighted sampling the random order is the weighted selection order.
973  *
974  * Two algorithms are used, reservoir sampling via a heap and reservoir sampling via
975  * Algorithm R. This routine selects the appropriate reservoir sampling function and
 * template instantiation to use based on the command line arguments.
977  *
978  * Weighted sampling always uses the heap approach. Compatibility mode does as well,
979  * as it is the method that uses per-line random value assignments. The implication
980  * of compatibility mode is that a larger sample size includes all the results from
981  * a smaller sample, assuming the same random seed is used.
982  *
983  * For unweighted sampling there is a performance tradeoff between implementations.
984  * Heap-based sampling is faster for small sample sizes. Algorithm R is faster for
985  * large sample sizes. The threshold used was chosen based on performance tests. See
986  * the reservoirSamplingAlgorithmR documentation for more information.
987  */
988 
989 void randomSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
990 if (isOutputRange!(OutputRange, char))
991 {
992     assert(cmdopt.sampleSize != 0);
993 
994     immutable size_t algorithmRSampleSizeThreshold = 128 * 1024;
995 
996     if (cmdopt.hasWeightField)
997     {
998         if (cmdopt.preserveInputOrder)
999         {
1000             reservoirSamplingViaHeap!(Yes.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream);
1001         }
1002         else
1003         {
1004             reservoirSamplingViaHeap!(Yes.isWeighted, No.preserveInputOrder)(cmdopt, outputStream);
1005         }
1006     }
1007     else if (cmdopt.compatibilityMode ||
1008              (cmdopt.sampleSize < algorithmRSampleSizeThreshold && !cmdopt.preferAlgorithmR))
1009     {
1010         if (cmdopt.preserveInputOrder)
1011         {
1012             reservoirSamplingViaHeap!(No.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream);
1013         }
1014         else
1015         {
1016             reservoirSamplingViaHeap!(No.isWeighted, No.preserveInputOrder)(cmdopt, outputStream);
1017         }
1018     }
1019     else if (cmdopt.preserveInputOrder)
1020     {
1021         reservoirSamplingAlgorithmR!(Yes.preserveInputOrder)(cmdopt, outputStream);
1022     }
1023     else
1024     {
1025         reservoirSamplingAlgorithmR!(No.preserveInputOrder)(cmdopt, outputStream);
1026     }
1027 }
1028 
1029 /** Reservoir sampling using a heap. Both weighted and unweighted random sampling are
1030  * supported.
1031  *
1032  * The algorithm used here is based on the one-pass algorithm described by Pavlos
1033  * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
1034  * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are
1035  * simply set to one.
1036  *
1037  * The implementation uses a heap (priority queue) large enough to hold the desired
1038  * number of lines. Input is read line-by-line, assigned a random value, and added to
1039  * the heap. The role of the heap is to identify the lines with the highest assigned
1040  * random values. Once the heap is full, adding a new line means dropping the line with
 * the lowest score. A "min" heap is used for this reason.
1042  *
1043  * When done reading all lines, the "min" heap is in reverse of weighted selection
 * order. Weighted selection order is obtained by removing each element one at a time
1045  * from the heap. The underlying data store will have the elements in weighted selection
1046  * order (largest weights first).
1047  *
1048  * Generating output in weighted order is useful for several reasons:
1049  *  - For weighted sampling, it preserves the property that smaller valid subsets can be
1050  *    created by taking the first N lines.
1051  *  - For unweighted sampling, it ensures that all output permutations are possible, and
1052  *    are not influenced by input order or the heap data structure used.
1053  *  - Order consistency is maintained when making repeated use of the same random seed,
1054  *    but with different sample sizes.
1055  *
 * The other choice is preserving input order. This is supported by recording line
1057  * numbers and sorting the selected sample.
1058  *
1059  * There are use cases where only the selection set matters. For these some performance
1060  * could be gained by skipping the reordering and simply printing the backing store
1061  * array in-order. Performance tests indicate only a minor benefit, so this is not
1062  * supported.
1063  *
1064  * Notes:
1065  * $(LIST
1066  *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
1067  *      randomization of all input lines. This was dropped in version 1.2.2 in favor
1068  *      of the approach used in randomizeLines. The latter has significant advantages
1069  *      given that all data must be read into memory.
1070  *    * For large reservoir sizes better performance can be achieved using Algorithm R.
1071  *      See the reservoirSamplingAlgorithmR documentation for details.
1072  * )
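 *
 * A minimal sketch of the per-line score assignment used below (the seed and weight
 * values are illustrative):
 * ---
 * import std.random : Mt19937, uniform01;
 *
 * auto rng = Mt19937(2438424139U);
 * double lineWeight = 3.0;    // From --w|weight-field (unweighted case: weight is 1).
 * // Efraimidis-Spirakis key: u ^^ (1/weight). The sample is the N lines with the
 * // largest scores; the min-heap evicts the current lowest score.
 * double weightedScore = uniform01(rng) ^^ (1.0 / lineWeight);
 * double unweightedScore = uniform01(rng);
 * ---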
1073  */
1074 void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
1075     (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1076 if (isOutputRange!(OutputRange, char))
1077 {
1078     import std.algorithm : sort;
1079     import std.container.array;
1080     import std.container.binaryheap;
1081     import std.meta : AliasSeq;
1082     import std.random : Random = Mt19937, uniform01;
1083     import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
1084         InputSourceRange, throwIfWindowsNewlineOnUnix;
1085 
1086     static if (isWeighted) assert(cmdopt.hasWeightField);
1087     else assert(!cmdopt.hasWeightField);
1088 
1089     assert(cmdopt.sampleSize > 0);
1090 
1091     assert(!cmdopt.inputSources.empty);
1092     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
1093 
1094     auto randomGenerator = Random(cmdopt.seed);
1095 
1096     static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
1097     {
1098         double score;
1099         const(char)[] line;
1100         static if (preserveInputOrder) ulong lineNumber;
1101     }
1102 
1103     /* Create the heap and backing data store.
1104      *
1105      * Note: An std.container.array is used as the backing store to avoid some issues in
1106      * the standard library (Phobos) binaryheap implementation. Specifically, when an
     * std.container.array is used as backing store, the heap can be efficiently reversed by
1108      * removing the heap elements. This leaves the backing store in the reversed order.
1109      * However, the current binaryheap implementation does not support this for all
1110      * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
1111      */
1112 
1113     Array!(Entry!preserveInputOrder) dataStore;
1114     dataStore.reserve(cmdopt.sampleSize);
1115     auto reservoir = dataStore.heapify!("a.score > b.score")(0);  // Min binaryheap
1116 
1117     /* First header is read during command line argument processing. */
1118     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
1119     {
1120         auto inputStream = cmdopt.inputSources.front;
1121 
1122         if (cmdopt.printRandom)
1123         {
1124             outputStream.put(cmdopt.randomValueHeader);
1125             outputStream.put(cmdopt.delim);
1126         }
1127         outputStream.put(inputStream.header);
1128         outputStream.put("\n");
1129 
1130         /* Immediately flush the header so subsequent processes in a unix command
1131          * pipeline see it early. This helps provide timely error messages.
1132          */
1133         static if (isFlushableOutputRange!OutputRange) outputStream.flush;
1134     }
1135 
1136     /* Process each line. */
1137     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
1138     static if (preserveInputOrder) ulong totalLineNum = 0;
1139 
1140     foreach (inputStream; cmdopt.inputSources)
1141     {
1142         if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);
1143 
1144         foreach (ulong fileLineNum, line;
1145                  inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
1146         {
1147             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);
1148 
1149             static if (!isWeighted)
1150             {
1151                 immutable double lineScore = uniform01(randomGenerator);
1152             }
1153             else
1154             {
1155                 immutable double lineWeight =
1156                     getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum);
1157                 immutable double lineScore =
1158                     (lineWeight > 0.0)
1159                     ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
1160                     : 0.0;
1161             }
1162 
1163             static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
1164             else alias entryCTArgs = AliasSeq!();
1165 
1166             if (reservoir.length < cmdopt.sampleSize)
1167             {
1168                 reservoir.insert(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs));
1169             }
1170             else if (reservoir.front.score < lineScore)
1171             {
1172                 reservoir.replaceFront(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs));
1173             }
1174 
1175             static if (preserveInputOrder) ++totalLineNum;
1176         }
1177     }
1178 
1179     /* Done with input, all entries are in the reservoir. */
1180 
    /* The asserts here avoid issues with the current binaryheap implementation. They
     * detect backing stores whose length is not synchronized with the reservoir.
1183      */
1184     immutable ulong numLines = reservoir.length;
1185     assert(numLines == dataStore.length);
1186 
1187     /* Update the backing store so it is in the desired output order.
1188      */
1189     static if (preserveInputOrder)
1190     {
1191         dataStore[].sort!((a, b) => a.lineNumber < b.lineNumber);
1192     }
1193     else
1194     {
        /* Output in weighted selection order. The heap is a min-heap on the assigned
         * scores, the reverse of the desired output order. Reversing is done by removing
         * all elements from the heap. This leaves the backing store in the correct order.
1198          */
1199         while (!reservoir.empty) reservoir.removeFront;
1200     }
1201 
1202     assert(numLines == dataStore.length);
1203 
1204     foreach (entry; dataStore)
1205     {
1206         if (cmdopt.printRandom)
1207         {
1208             outputStream.formatRandomValue(entry.score);
1209             outputStream.put(cmdopt.delim);
1210         }
1211         outputStream.put(entry.line);
1212         outputStream.put("\n");
1213     }
1214  }
1215 
1216 /** Generate weighted random values for all input lines, preserving input order.
1217  *
 * This complements weighted reservoir sampling, but instead of using a reservoir it
 * simply iterates over the input lines, generating a value for each. The weighted random
 * values are generated with the same formula used by reservoirSamplingViaHeap.
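 *
 * A minimal sketch (not part of this module's API) of the per-line value computation,
 * where w is the weight taken from the weight field:
 *
 *    import std.math;                          // '^^' on floating point operands uses std.math.pow
 *    import std.random : Mt19937, uniform01;
 *
 *    auto rng = Mt19937(42);
 *    immutable double w = 3.0;
 *    immutable double value = uniform01(rng) ^^ (1.0 / w);  // larger weights push values toward 1.0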
1221  */
1222 void generateWeightedRandomValuesInorder(OutputRange)
1223     (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1224 if (isOutputRange!(OutputRange, char))
1225 {
1226     import std.random : Random = Mt19937, uniform01;
1227     import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
1228         InputSourceRange, throwIfWindowsNewlineOnUnix;
1229 
1230     assert(cmdopt.hasWeightField);
1231 
1232     assert(!cmdopt.inputSources.empty);
1233     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
1234 
1235     auto randomGenerator = Random(cmdopt.seed);
1236 
1237     /* First header is read during command line argument processing. */
1238     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
1239     {
1240         auto inputStream = cmdopt.inputSources.front;
1241 
1242         outputStream.put(cmdopt.randomValueHeader);
1243         outputStream.put(cmdopt.delim);
1244         outputStream.put(inputStream.header);
1245         outputStream.put("\n");
1246 
1247         /* Immediately flush the header so subsequent processes in a unix command
1248          * pipeline see it early. This helps provide timely error messages.
1249          */
1250         static if (isFlushableOutputRange!OutputRange) outputStream.flush;
1251     }
1252 
1253     /* Process each line. */
1254     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
1255     ulong numLinesWritten = 0;
1256 
1257     foreach (inputStream; cmdopt.inputSources)
1258     {
1259         if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);
1260 
1261         foreach (ulong fileLineNum, line;
1262                  inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
1263         {
1264             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);
1265 
1266             immutable double lineWeight =
1267                 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum);
1268 
1269             immutable double lineScore =
1270                 (lineWeight > 0.0)
1271                 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
1272                 : 0.0;
1273 
1274             outputStream.formatRandomValue(lineScore);
1275             outputStream.put(cmdopt.delim);
1276             outputStream.put(line);
1277             outputStream.put("\n");
1278 
1279             if (cmdopt.sampleSize != 0)
1280             {
1281                 ++numLinesWritten;
1282                 if (numLinesWritten == cmdopt.sampleSize) return;
1283             }
1284         }
1285     }
1286 }
1287 
1288 /** Reservoir sampling via Algorithm R
1289  *
1290  * This is an implementation of reservoir sampling using what is commonly known as
1291  * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of
1292  * Computer Programming, Volume 2: Seminumerical Algorithms". More information about
1293  * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with
1294  * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
1295  * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
1296  *
1297  * Algorithm R is used for unweighted sampling without replacement. The heap-based
1298  * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
1299  *
1300  * The classic algorithm stops after identifying the selected set of items. This
1301  * implementation goes one step further and randomizes the order of the selected
1302  * lines. This is consistent with shuffling (line order randomization), a primary
1303  * tsv-sample use-case.
1304  *
1305  * This algorithm is faster than reservoirSamplingViaHeap when the sample size
1306  * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
1307  * Insertion in this algorithm is O(1). Similarly, generating the random order in the
1308  * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
1309  *
 * This speed advantage may be offset to some degree by the more expensive random value
 * generation. reservoirSamplingViaHeap generates values between zero and one,
 * whereas reservoirSamplingAlgorithmR generates random integers over an ever-growing
 * interval. The latter is expected to be more expensive. This is consistent with
1314  * performance tests indicating that reservoirSamplingViaHeap is faster when using
1315  * small-to-medium size reservoirs and large input streams.
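 *
 * A minimal sketch of the classic algorithm over an in-memory array (illustrative only;
 * the helper name sampleK is hypothetical and this is not the streaming implementation
 * below):
 *
 *    import std.algorithm : min;
 *    import std.random : Random, uniform;
 *
 *    string[] sampleK(string[] items, size_t k, ref Random rng)
 *    {
 *        auto reservoir = items[0 .. min(k, items.length)].dup;
 *        foreach (i; k .. items.length)
 *        {
 *            immutable j = uniform(0, i + 1, rng);    // random slot in [0, i]
 *            if (j < k) reservoir[j] = items[i];      // keep item i with probability k/(i+1)
 *        }
 *        return reservoir;
 *    }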
1316  */
1317 void reservoirSamplingAlgorithmR(Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
1318     (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1319 if (isOutputRange!(OutputRange, char))
1320 {
1321     import std.meta : AliasSeq;
1322     import std.random : Random = Mt19937, randomShuffle, uniform;
1323     import std.algorithm : sort;
1324     import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
1325         InputSourceRange, throwIfWindowsNewlineOnUnix;
1326 
1327     assert(cmdopt.sampleSize > 0);
1328     assert(!cmdopt.hasWeightField);
1329     assert(!cmdopt.compatibilityMode);
1330     assert(!cmdopt.printRandom);
1331     assert(!cmdopt.genRandomInorder);
1332 
1333     assert(!cmdopt.inputSources.empty);
1334     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
1335 
1336     static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
1337     {
1338         const(char)[] line;
1339         static if (preserveInputOrder) ulong lineNumber;
1340     }
1341 
1342     Entry!preserveInputOrder[] reservoir;
1343     auto reservoirAppender = appender(&reservoir);
1344     reservoirAppender.reserve(cmdopt.sampleSize);
1345 
1346     auto randomGenerator = Random(cmdopt.seed);
1347 
1348     /* First header is read during command line argument processing. */
1349     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
1350     {
1351         auto inputStream = cmdopt.inputSources.front;
1352 
1353         outputStream.put(inputStream.header);
1354         outputStream.put("\n");
1355 
1356         /* Immediately flush the header so subsequent processes in a unix command
1357          * pipeline see it early. This helps provide timely error messages.
1358          */
1359         static if (isFlushableOutputRange!OutputRange) outputStream.flush;
1360     }
1361 
1362     /* Process each line. */
1363     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
1364     ulong totalLineNum = 0;
1365 
1366     foreach (inputStream; cmdopt.inputSources)
1367     {
1368         if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);
1369 
1370         foreach (ulong fileLineNum, line;
1371                  inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
1372         {
1373             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum);
1374 
1375             /* Add lines to the reservoir until the reservoir is filled.
1376              * After that lines are added with decreasing likelihood, based on
1377              * the total number of lines seen. If added to the reservoir, the
1378              * line replaces a randomly chosen existing line.
1379              */
1380             static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
1381             else alias entryCTArgs = AliasSeq!();
1382 
1383             if (totalLineNum < cmdopt.sampleSize)
1384             {
1385                 reservoirAppender ~= Entry!preserveInputOrder(line.idup, entryCTArgs);
1386             }
1387             else
1388             {
1389                 immutable size_t i = uniform(0, totalLineNum, randomGenerator);
1390                 if (i < reservoir.length)
1391                 {
1392                     reservoir[i] = Entry!preserveInputOrder(line.idup, entryCTArgs);
1393                 }
1394             }
1395 
1396             ++totalLineNum;
1397         }
1398     }
1399 
1400     /* Done with input. The sample is in the reservoir. Update the order and print. */
1401 
1402     static if (preserveInputOrder)
1403     {
1404         reservoir.sort!((a, b) => a.lineNumber < b.lineNumber);
1405     }
1406     else
1407     {
1408         reservoir.randomShuffle(randomGenerator);
1409     }
1410 
1411     foreach (ref entry; reservoir)
1412     {
1413         outputStream.put(entry.line);
1414         outputStream.put("\n");
1415     }
1416 }
1417 
1418 /** Shuffling command handler. Invokes the appropriate shuffle (line order
1419  * randomization) routine based on the command line arguments.
1420  *
 * Shuffling has similarities to random sampling, but the algorithms used are
 * different. Random sampling selects a subset; only the current subset selection
 * needs to be kept in memory, which reservoir sampling supports. By contrast,
 * shuffling needs to hold all input in memory, so it works better to read all lines
 * into memory at once and then shuffle.
1426  *
1427  * Two different algorithms are used. Array shuffling is used for unweighted shuffling.
1428  * Sorting plus random weight assignments is used for weighted shuffling and when
1429  * compatibility mode is being used.
1430  *
1431  * The algorithms used here are all limited by available memory.
1432  */
1433 void shuffleCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1434 if (isOutputRange!(OutputRange, char))
1435 {
1436     if (cmdopt.hasWeightField)
1437     {
1438         randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
1439     }
1440     else if (cmdopt.compatibilityMode)
1441     {
1442         randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
1443     }
1444     else
1445     {
1446         randomizeLinesViaShuffle(cmdopt, outputStream);
1447     }
1448 }
1449 
1450 /** Shuffle all input lines by assigning random weights and sorting.
1451  *
1452  * randomizeLinesViaSort reads in all input lines and writes them out in random order.
1453  * The algorithm works by assigning a random value to each line and sorting. Both
1454  * weighted and unweighted shuffling are supported.
1455  *
1456  * Notes:
1457  * $(LIST
1458  *   * For unweighted shuffling randomizeLinesViaShuffle is faster and should be used
1459  *     unless compatibility mode is needed.
1460  *   * This routine is significantly faster than heap-based reservoir sampling in the
1461  *     case where the entire file is being read.
 *   * Input data must be read entirely into memory. Disk-oriented techniques are needed
 *     when data sizes get too large for available memory. One option is to generate
 *     random values for each line (e.g. --gen-random-inorder) and sort with a disk-
 *     backed sort program like GNU sort.
1466  * )
1467  */
1468 void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)
1469     (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1470 if (isOutputRange!(OutputRange, char))
1471 {
1472     import std.algorithm : map, sort;
1473 
1474     static if (isWeighted) assert(cmdopt.hasWeightField);
1475     else assert(!cmdopt.hasWeightField);
1476 
1477     assert(cmdopt.sampleSize == 0);
1478 
1479     /*
1480      * Read all file data into memory. Then split the data into lines and assign a
1481      * random value to each line. readFileData also writes the first header line.
1482      */
1483     const fileData = readFileData!(Yes.hasRandomValue)(cmdopt, outputStream);
1484     auto inputLines = fileData.identifyInputLines!(Yes.hasRandomValue, isWeighted)(cmdopt);
1485 
1486     /*
1487      * Sort by the weight and output the lines.
1488      */
1489     inputLines.sort!((a, b) => a.randomValue > b.randomValue);
1490 
1491     foreach (lineEntry; inputLines)
1492     {
1493         if (cmdopt.printRandom)
1494         {
1495             outputStream.formatRandomValue(lineEntry.randomValue);
1496             outputStream.put(cmdopt.delim);
1497         }
1498         outputStream.put(lineEntry.data);
1499         outputStream.put("\n");
1500     }
1501 }
1502 
1503 /** Shuffle (randomize) all input lines using a shuffling algorithm.
1504  *
1505  * All lines in files and/or standard input are read in and written out in random
1506  * order. This routine uses array shuffling, which is faster than sorting. It is a
1507  * good alternative to randomizeLinesViaSort when doing unweighted shuffling (the
1508  * most common case).
1509  *
 * Input data size is limited by available memory. Disk-oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (as
 * with --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 *
 * This routine does not support random value printing or compatibility mode.
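 *
 * A minimal sketch (not this routine's implementation) of the underlying technique,
 * shuffling an in-memory array with std.random.randomShuffle:
 *
 *    import std.random : Mt19937, randomShuffle;
 *
 *    auto rng = Mt19937(2718);
 *    auto lines = ["a", "b", "c", "d"];
 *    lines.randomShuffle(rng);                 // in-place shuffle of the array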
1515  */
1516 void randomizeLinesViaShuffle(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1517 if (isOutputRange!(OutputRange, char))
1518 {
1519     import std.algorithm : map;
1520     import std.random : Random = Mt19937, randomShuffle;
1521 
1522     assert(cmdopt.sampleSize == 0);
1523     assert(!cmdopt.hasWeightField);
1524     assert(!cmdopt.printRandom);
1525     assert(!cmdopt.genRandomInorder);
1526 
1527     /*
1528      * Read all file data into memory and split into lines.
1529      */
1530     const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream);
1531     auto inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt);
1532 
1533     /*
1534      * Randomly shuffle and print each line.
1535      *
1536      * Note: Also tried randomCover, but that was exceedingly slow.
1537      */
1539 
1540     auto randomGenerator = Random(cmdopt.seed);
1541     inputLines.randomShuffle(randomGenerator);
1542 
1543     foreach (ref line; inputLines)
1544     {
1545         outputStream.put(line.data);
1546         outputStream.put("\n");
1547     }
1548 }
1549 
1550 /** Simple random sampling with replacement.
1551  *
1552  * All lines in files and/or standard input are read in. Then random lines are selected
1553  * one at a time and output. Lines can be selected multiple times. This process continues
1554  * until the desired number of samples (--n|num) has been output. Output continues
1555  * indefinitely if a sample size was not provided.
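 *
 * A minimal sketch (not this routine's implementation) of sampling with replacement
 * from an in-memory array:
 *
 *    import std.random : Mt19937, uniform;
 *    import std.stdio : writeln;
 *
 *    auto rng = Mt19937(77);
 *    string[] lines = ["a", "b", "c"];
 *    foreach (_; 0 .. 5)
 *        writeln(lines[uniform(0, lines.length, rng)]);   // the same line may appear multiple times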
1556  */
1557 void simpleRandomSamplingWithReplacement(OutputRange)
1558     (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1559 if (isOutputRange!(OutputRange, char))
1560 {
1561     import std.algorithm : map;
1562     import std.random : Random = Mt19937, uniform;
1563 
1564     /*
1565      * Read all file data into memory and split the data into lines.
1566      */
1567     const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream);
1568     const inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt);
1569 
1570     if (inputLines.length > 0)
1571     {
1572         auto randomGenerator = Random(cmdopt.seed);
1573 
        /* Repeat forever if sampleSize is zero, otherwise print sampleSize lines. */
1575         size_t numLeft = (cmdopt.sampleSize == 0) ? 1 : cmdopt.sampleSize;
1576         while (numLeft != 0)
1577         {
1578             immutable size_t index = uniform(0, inputLines.length, randomGenerator);
1579             outputStream.put(inputLines[index].data);
1580             outputStream.put("\n");
1581             if (cmdopt.sampleSize != 0) numLeft--;
1582         }
1583     }
1584 }
1585 
1586 /** A container holding data read from a file or standard input.
1587  *
1588  * The InputBlock struct is used to represent a block of data read from a file or
1589  * standard input. An array of InputBlocks is returned by readFileData. Typically one
1590  * block per file. Multiple blocks are used for standard input and when the file size
1591  * cannot be determined. Individual lines are not allowed to span blocks. The blocks
1592  * allocated to an individual file are numbered starting with zero.
1593  *
1594  * See readFileData() for more information.
1595  */
1596 static struct InputBlock
1597 {
1598     string filename;          /// Original filename or path. "-" denotes standard input.
1599     size_t fileBlockNumber;   /// Zero-based block number for the file.
1600     char[] data;              /// The actual data. Newline terminated or last block for the file.
1601 }
1602 
1603 /** Read data from one or more files. This routine is used by algorithms needing to
1604  * read all data into memory.
1605  *
1606  * readFileData reads in all data from a set of files. Data is returned as an array
1607  * of InputBlock structs. Normally one InputBlock per file, sized to match the size
1608  * of the file. Standard input is read in one or more blocks, as are files whose size
1609  * cannot be determined. Multiple blocks are used in these last two cases to avoid
1610  * expensive memory reallocations. This is not necessary when file size is known as
1611  * the necessary memory can be preallocated.
1612  *
1613  * Individual lines never span multiple blocks, and newlines are preserved. This
1614  * means that each block starts at the beginning of a line and ends with a newline
1615  * unless the end of a file has been reached.
1616  *
 * Each file gets its own block. Prior to using InputSourceRange this was needed so
 * header processing could be done per file. With InputSourceRange the header is read
 * separately, so this could be changed.
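 *
 * The typical usage pattern, as in the in-memory algorithms above:
 *
 *    const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream);
 *    auto inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt);
 *    // Each element of inputLines holds one line in its 'data' member.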
1620  */
1621 InputBlock[] readFileData(HasRandomValue hasRandomValue, OutputRange)
1622 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
1623 if (isOutputRange!(OutputRange, char))
1624 {
1625     import std.algorithm : find, min;
1626     import std.range : retro;
1627     import tsv_utils.common.utils : InputSourceRange, isFlushableOutputRange,
1628         throwIfWindowsNewlineOnUnix;
1629 
1630     static if(!hasRandomValue) assert(!cmdopt.printRandom);
1631 
1632     assert(!cmdopt.inputSources.empty);
1633     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
1634 
1635     /* First header is read during command line argument processing. */
1636     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
1637     {
1638         auto inputStream = cmdopt.inputSources.front;
1639 
1640         if (cmdopt.printRandom)
1641         {
1642             outputStream.put(cmdopt.randomValueHeader);
1643             outputStream.put(cmdopt.delim);
1644         }
1645         outputStream.put(inputStream.header);
1646         outputStream.put("\n");
1647 
1648         /* Immediately flush the header so subsequent processes in a unix command
1649          * pipeline see it early. This helps provide timely error messages.
1650          */
1651         static if (isFlushableOutputRange!OutputRange) outputStream.flush;
1652     }
1653 
1654     enum BlockSize = 1024L * 1024L * 1024L;  // 1 GB. ('L' notation avoids overflow w/ 2GB+ sizes.)
1655     enum ReadSize = 1024L * 128L;
1656     enum NewlineSearchSize = 1024L * 16L;
1657 
1658     InputBlock[] blocks;
1659     auto blocksAppender = appender(&blocks);
1660     blocksAppender.reserve(cmdopt.inputSources.length);  // At least one block per file.
1661 
1662     ubyte[] rawReadBuffer = new ubyte[ReadSize];
1663 
1664     foreach (inputStream; cmdopt.inputSources)
1665     {
1666         if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);
1667 
1668         /* If the file size can be determined then read it as a single block.
1669          * Otherwise read as multiple blocks. File.size() returns ulong.max
1670          * if file size cannot be determined, so we'll combine that check
1671          * with the standard input case.
1672          */
1673 
1674         immutable ulong filesize = inputStream.isStdin ? ulong.max : inputStream.file.size;
1675         auto ifile = inputStream.file;
1676 
1677         if (filesize != ulong.max)
1678         {
1679             readFileDataAsOneBlock(inputStream.name, ifile, filesize,
1680                                    blocksAppender, rawReadBuffer);
1681         }
1682         else
1683         {
1684             readFileDataAsMultipleBlocks(
1685                 inputStream.name, ifile, blocksAppender, rawReadBuffer,
1686                 BlockSize, NewlineSearchSize);
1687         }
1688     }
1689     return blocks;
1690 }
1691 
1692 /* readFileData() helper function. Read data from a File handle as a single block. The
1693  * new block is appended to an existing InputBlock[] array.
1694  *
 * readFileDataAsOneBlock is part of the readFileData logic. It handles the case
1696  * where a file is being read as a single block. Normally initialBlockSize is passed
1697  * as the size of the file.
1698  *
1699  * This routine has been separated out to enable unit testing. At present it is not
1700  * intended as a general API. See readFileData for more info.
1701  */
1702 private void readFileDataAsOneBlock(
1703     string filename,
1704     ref File ifile,
1705     const ulong initialBlockSize,
1706     ref RefAppender!(InputBlock[]) blocksAppender,
1707     ref ubyte[] rawReadBuffer)
1708 {
1709     blocksAppender.put(InputBlock(filename, 0));
1710     auto dataAppender = appender(&(blocksAppender.data[$-1].data));
1711     dataAppender.reserve(initialBlockSize);
1712 
1713     foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
1714     {
1715         dataAppender.put(cast(char[]) buffer);
1716     }
1717 }
1718 
1719 /* readFileData() helper function. Read data from a File handle as one or more blocks.
1720  * Blocks are appended to an existing InputBlock[] array.
1721  *
1722  * readFileDataAsMultipleBlocks is part of the readFileData logic. It handles the case
1723  * where a file or standard input is being read as a series of blocks. This is the
1724  * standard approach for standard input, but also applies when the file size cannot be
1725  * determined.
1726  *
1727  * This routine has been separated out to enable unit testing. At present it is not
1728  * intended as a general API. See readFileData for more info.
1729  */
1730 private void readFileDataAsMultipleBlocks(
1731     string filename,
1732     ref File ifile,
1733     ref RefAppender!(InputBlock[]) blocksAppender,
1734     ref ubyte[] rawReadBuffer,
1735     const size_t blockSize,
1736     const size_t newlineSearchSize)
1737 {
1738     import std.algorithm : find, min;
1739     import std.range : retro;
1740 
1741     assert(ifile.isOpen);
1742 
1743     /* Create a new block for the file and an Appender for writing data.
1744      */
1745     blocksAppender.put(InputBlock(filename, 0));
1746     auto dataAppender = appender(&(blocksAppender.data[$-1].data));
1747     dataAppender.reserve(blockSize);
1748     size_t blockNumber = 0;
1749 
1750     /* Read all the data and copy it to an InputBlock. */
1751     foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
1752     {
1753         assert(blockNumber == blocksAppender.data[$-1].fileBlockNumber);
1754 
1755         immutable size_t remainingCapacity = dataAppender.capacity - dataAppender.data.length;
1756 
1757         if (buffer.length <= remainingCapacity)
1758         {
1759             dataAppender.put(cast(char[]) buffer);
1760         }
1761         else
1762         {
1763             /* Look for the last newline in the input buffer that fits in remaining
1764              * capacity of the block.
1765              */
1766             auto searchRegion = buffer[0 .. remainingCapacity];
1767             auto appendRegion = searchRegion.retro.find('\n').source;
1768 
1769             if (appendRegion.length > 0)
1770             {
1771                 /* Copy the first part of the read buffer to the block. */
1772                 dataAppender.put(cast(char[]) appendRegion);
1773 
1774                 /* Create a new InputBlock and copy the remaining data to it. */
1775                 blockNumber++;
1776                 blocksAppender.put(InputBlock(filename, blockNumber));
1777                 dataAppender = appender(&(blocksAppender.data[$-1].data));
1778                 dataAppender.reserve(blockSize);
1779                 dataAppender.put(cast(char[]) buffer[appendRegion.length .. $]);
1780 
1781                 assert(blocksAppender.data.length >= 2);
1782                 assert(blocksAppender.data[$-2].data[$-1] == '\n');
1783             }
1784             else
1785             {
                /* Search backward in the current block for a newline. If found, it
                 * becomes the last newline in the current block and anything following
                 * it is moved to a new block. If a newline is not found, simply append
                 * to the current block and let it grow. The backward search is limited
                 * to the last newlineSearchSize characters of the current block.
1791                  */
1792                 immutable size_t currBlockLength = blocksAppender.data[$-1].data.length;
1793                 immutable size_t searchLength = min(currBlockLength, newlineSearchSize);
1794                 immutable size_t searchStart = currBlockLength - searchLength;
1795                 auto blockSearchRegion = blocksAppender.data[$-1].data[searchStart .. $];
1796                 auto lastNewlineOffset = blockSearchRegion.retro.find('\n').source.length;
1797 
1798                 if (lastNewlineOffset != 0)
1799                 {
1800                     /* Create a new InputBlock. The previous InputBlock is then found
1801                      * at blocksAppender.data[$-2]. It may be a physically different
1802                      * struct (a copy) if the blocks array gets reallocated.
1803                      */
1804                     blockNumber++;
1805                     blocksAppender.put(InputBlock(filename, blockNumber));
1806                     dataAppender = appender(&(blocksAppender.data[$-1].data));
1807                     dataAppender.reserve(blockSize);
1808 
1809                     /* Copy data following the newline from the last block to the new
1810                      * block. Then append the current read buffer.
1811                      */
1812                     immutable size_t moveRegionStart = searchStart + lastNewlineOffset;
1813                     dataAppender.put(blocksAppender.data[$-2].data[moveRegionStart .. $]);
1814                     dataAppender.put(cast(char[]) buffer);
1815 
1816                     /* Now delete the moved region from the last block. */
1817                     blocksAppender.data[$-2].data.length = moveRegionStart;
1818 
1819                     assert(blocksAppender.data.length >= 2);
1820                     assert(blocksAppender.data[$-2].data[$-1] == '\n');
1821                 }
1822                 else
1823                 {
1824                     /* Give up. Allow the current block to grow. */
1825                     dataAppender.put(cast(char[]) buffer);
1826                 }
1827             }
1828         }
1829     }
1830 }
1831 
1832 /** HasRandomValue is a boolean flag used at compile time by identifyInputLines to
1833  * distinguish use cases needing random value assignments from those that don't.
1834  */
1835 alias HasRandomValue = Flag!"hasRandomValue";
1836 
/** An InputLine array is returned by identifyInputLines to represent each non-header
 * line found in an InputBlock array. The 'data' member contains the line. A 'randomValue'
 * member is included if random values are being generated.
1840  */
1841 static struct InputLine(HasRandomValue hasRandomValue)
1842 {
1843     const(char)[] data;
1844     static if (hasRandomValue) double randomValue;
1845 }
1846 
1847 /** identifyInputLines is used by algorithms that read all files into memory prior to
1848  * processing. It does the initial processing of the file data.
1849  *
 * Two main tasks are performed. One is splitting all input data into lines. The second
 * is assigning a random value to each line, if random values are being generated.
1852  *
1853  * The key input is an InputBlock array. Normally one block for each file, but standard
1854  * input may have multiple blocks.
1855  *
1856  * The return value is an array of InputLine structs. The struct will have a 'randomValue'
1857  * member if random values are being assigned.
1858  */
1859 InputLine!hasRandomValue[] identifyInputLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted)
1860 (const ref InputBlock[] inputBlocks, ref TsvSampleOptions cmdopt)
1861 {
1862     import std.algorithm : splitter;
1863     import std.array : appender;
1864     import std.random : Random = Mt19937, uniform01;
1865     import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;
1866 
1867     static assert(hasRandomValue || !isWeighted);
1868     static if(!hasRandomValue) assert(!cmdopt.printRandom);
1869 
1870     InputLine!hasRandomValue[] inputLines;
1871 
1872     auto linesAppender = appender(&inputLines);
1873     static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed);
1874 
1875     /* Note: fileLineNum is zero-based here. One-based in most other code in this file. */
1876     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 1 : 0;
1877     size_t fileLineNum = fileBodyStartLine;
1878 
1879     foreach (block; inputBlocks)
1880     {
1881         /* Drop the last newline to avoid adding an extra empty line. */
1882         const data = (block.data.length > 0 && block.data[$-1] == '\n') ?
1883             block.data[0 .. $-1] : block.data;
1884 
1885         if (block.fileBlockNumber == 0) fileLineNum = fileBodyStartLine;
1886 
1887         foreach (ref line; data.splitter('\n'))
1888         {
1889             fileLineNum++;
1890 
1891             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, block.filename, fileLineNum);
1892 
1893             static if (!hasRandomValue)
1894             {
1895                 linesAppender.put(InputLine!hasRandomValue(line));
1896             }
1897             else
1898             {
1899                 static if (!isWeighted)
1900                 {
1901                     immutable double randomValue = uniform01(randomGenerator);
1902                 }
1903                 else
1904                 {
1905                     immutable double lineWeight =
1906                         getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
1907                                              block.filename, fileLineNum);
1908                     immutable double randomValue =
1909                         (lineWeight > 0.0)
1910                         ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
1911                         : 0.0;
1912                 }
1913 
1914                 linesAppender.put(InputLine!hasRandomValue(line, randomValue));
1915             }
1916         }
1917     }
1918 
1919     return inputLines;
1920 }
1921 
1922 
/* Unit tests for readFileData. These tests focus on multiple InputBlock scenarios.
 * Other use paths are well covered by the tests at the end of this file.
1925  */
1926 unittest
1927 {
1928     import tsv_utils.common.unittest_utils;
1929     import std.algorithm : equal, find, joiner, splitter;
1930     import std.array : appender;
1931     import std.file : rmdirRecurse;
1932     import std.path : buildPath;
1933     import std.range : repeat;
1934 
1935     auto rfdTestDir = makeUnittestTempDir("tsv_sample_readFileData");
1936     scope(exit) rfdTestDir.rmdirRecurse;
1937 
1938     char[] file1Data;
1939     char[] file2Data;
1940     char[] file3Data;
1941 
1942     auto app1 = appender(&file1Data);
1943     auto app2 = appender(&file2Data);
1944     auto app3 = appender(&file3Data);
1945 
1946     /* File 1: 1000 short lines. */
1947     app1.put("\n".repeat(100).joiner);
1948     app1.put("x\n".repeat(100).joiner);
1949     app1.put("yz\n".repeat(100).joiner);
1950     app1.put("pqr\n".repeat(100).joiner);
1951     app1.put("a\nbc\ndef\n".repeat(100).joiner);
1952     app1.put('\n'.repeat(100));
1953     app1.put("z\n".repeat(100).joiner);
1954     app1.put("xy\n".repeat(100).joiner);
1955 
    /* File 2: 600 longer lines. */
1957     app2.put(
1958         "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
1959         .repeat(100)
1960         .joiner);
1961     app2.put(
1962         "|abcdefghijklmnopqrstuv|\n|0123456789|\n|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ|\n|abcdefghijklmnopqrstuvwxyz|\n"
1963         .repeat(100)
1964         .joiner);
1965     app2.put(
1966          "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
1967         .repeat(100)
1968         .joiner);
1969 
1970     /* File 3: 1000 mixed length lines. */
1971     app3.put("\n\n|abcde|\n1\n12\n123\n|abcdefghijklmnop|\n|xyz|\n0123456789\nX\n".repeat(100).joiner);
1972 
1973     string file1Path = buildPath(rfdTestDir, "file1.txt");
1974     string file2Path = buildPath(rfdTestDir, "file2.txt");
1975     string file3Path = buildPath(rfdTestDir, "file3.txt");
1976 
1977     try
1978     {
1979         auto ofile1 = File(file1Path, "w");
1980         ofile1.write(file1Data);
1981     }
1982     catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file1Path, e.msg));
1983 
1984     try
1985     {
1986         auto ofile2 = File(file2Path, "w");
1987         ofile2.write(file2Data);
1988     }
1989     catch (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file2Path, e.msg));
1990 
1991     try
1992     {
1993         auto ofile3 = File(file3Path, "w");
1994         ofile3.write(file3Data);
1995     }
1996     catch  (Exception e) assert(false, format("Failed to write file: %s.\n  Error: %s", file3Path, e.msg));
1997 
1998     auto allData = file1Data ~ file2Data ~ file3Data;
1999     auto expectedLines = allData.splitter('\n').array[0 .. $-1];
2000 
2001     auto file2DataNoHeader = (file2Data.find('\n'))[1 .. $];
2002     auto file3DataNoHeader = (file3Data.find('\n'))[1 .. $];
2003     auto allDataUsingHeader = file1Data ~ file2DataNoHeader ~ file3DataNoHeader;
2004     auto expectedLinesUsingHeader = allDataUsingHeader.splitter('\n').array[0 .. $-1];
2005 
2006     assert(expectedLines.length == expectedLinesUsingHeader.length + 2);
2007 
2008     TsvSampleOptions cmdoptNoHeader;
2009     auto noHeaderCmdArgs = ["unittest", file1Path];
2010     auto r1 = cmdoptNoHeader.processArgs(noHeaderCmdArgs);
    assert(r1[0], format("Invalid command line args: '%s'.", noHeaderCmdArgs));
2012 
2013     TsvSampleOptions cmdoptYesHeader;
2014     auto yesHeaderCmdArgs = ["unittest", "--header", file1Path];
2015     auto r2 = cmdoptYesHeader.processArgs(yesHeaderCmdArgs);
    assert(r2[0], format("Invalid command line args: '%s'.", yesHeaderCmdArgs));
2017 
2018     auto outputStream = appender!(char[])();
2019 
2020     {
2021         /* Reading as single blocks. */
2022         ubyte[] rawReadBuffer = new ubyte[256];
2023         InputBlock[] blocks;
2024         auto blocksAppender = appender(&blocks);
2025         blocksAppender.reserve(3);
2026         foreach (f; [ file1Path, file2Path, file3Path ])
2027         {
2028             auto ifile = f.File;
2029             ulong filesize = ifile.size;
2030             if (filesize == ulong.max) filesize = 1000;
2031             readFileDataAsOneBlock(f, ifile, filesize, blocksAppender, rawReadBuffer);
2032         }
2033         auto inputLines =
2034             identifyInputLines!(No.hasRandomValue, No.isWeighted)(
2035                 blocks, cmdoptNoHeader);
2036 
2037         assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
2038     }
2039 
2040     {
2041         /* Reading as multiple blocks. */
2042         foreach (size_t searchSize; [ 0, 1, 2, 64 ])
2043         {
2044             foreach (size_t blockSize; [ 1, 2, 16, 64, 256 ])
2045             {
2046                 foreach (size_t readSize; [ 1, 2, 8, 32 ])
2047                 {
2048                     ubyte[] rawReadBuffer = new ubyte[readSize];
2049                     InputBlock[] blocks;
2050                     auto blocksAppender = appender(&blocks);
2051                     blocksAppender.reserve(3);
2052                     foreach (f; [ file1Path, file2Path, file3Path ])
2053                     {
2054                         auto ifile = f.File;
2055                         readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
2056                                                      rawReadBuffer, blockSize, searchSize);
2057                     }
2058                     auto inputLines =
2059                         identifyInputLines!(No.hasRandomValue, No.isWeighted)(
2060                             blocks, cmdoptNoHeader);
2061 
2062                     assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
2063                 }
2064             }
2065         }
2066     }
2067     version(none) {
2068     {
2069         /* Reading as multiple blocks, with header processing. */
2070         const size_t readSize = 32;
2071         const size_t blockSize = 48;
2072         const size_t searchSize = 16;
2073 
2074         ubyte[] rawReadBuffer = new ubyte[readSize];
2075         InputBlock[] blocks;
2076         auto blocksAppender = appender(&blocks);
2077         blocksAppender.reserve(3);
2078         foreach (f; [ file1Path, file2Path, file3Path ])
2079         {
2080             auto ifile = f.File;
2081             readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
2082                                          rawReadBuffer, blockSize, searchSize);
2083         }
2084         auto inputLines =
2085             identifyInputLines!(No.hasRandomValue, No.isWeighted)(
2086                 blocks, cmdoptYesHeader);
2087 
2088         assert(outputStream.data == expectedLinesUsingHeader[0] ~ '\n');
2089         assert(equal!((a, b) => a.data == b)(inputLines, expectedLinesUsingHeader[1 .. $]));
2090     }
2091     }
2092 }
2093 
2094 /** Write a floating point random value to an output stream.
2095  *
 * This routine is used for floating point random value printing. It writes 17
 * significant digits, the range available in doubles, and prefers decimal format
 * without exponents. It will generate somewhat high precision numbers, currently up
 * to 28 digits after the decimal point, before switching to exponents.
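 * For example, 0.012345678901234567 falls in the [0.01, 0.1) range and is written with
 * the "%.18f" spec: 18 digits after the decimal point, 17 of them significant.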
2100  *
2101  * The primary reason for this approach is to enable faster sorting on random values
2102  * by GNU sort and similar external sorting programs. GNU sort is dramatically faster
2103  * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch).
 * The 'general numeric' sort handles exponential notation. The difference is 5-10x.
2105  *
2106  * Random values generated by Bernoulli sampling are nearly always greater than 1e-12.
 * No examples less than 1e-09 were seen in hundreds of millions of trials. Similar
 * results were seen with weighted sampling using integer weights. The same is not true
 * of floating point weights, which produce quite large exponents. However, even for
 * floating point weights this approach can be useful. For random weights in [0,1],
 * less than 5% of the values will be below 1e-12 and use exponential notation.
2112  */
2113 void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value)
2114 if (isOutputRange!(OutputRange, char))
2115 {
2116     import std.format : formatValue, singleSpec;
2117 
2118     immutable spec17f = singleSpec("%.17f");
2119     immutable spec18f = singleSpec("%.18f");
2120     immutable spec19f = singleSpec("%.19f");
2121     immutable spec20f = singleSpec("%.20f");
2122     immutable spec21f = singleSpec("%.21f");
2123     immutable spec22f = singleSpec("%.22f");
2124     immutable spec23f = singleSpec("%.23f");
2125     immutable spec24f = singleSpec("%.24f");
2126     immutable spec25f = singleSpec("%.25f");
2127     immutable spec26f = singleSpec("%.26f");
2128     immutable spec27f = singleSpec("%.27f");
2129     immutable spec28f = singleSpec("%.28f");
2130 
2131     immutable spec17g = singleSpec("%.17g");
2132 
2133     immutable formatSpec =
2134         (value >= 1e-01) ? spec17f :
2135         (value >= 1e-02) ? spec18f :
2136         (value >= 1e-03) ? spec19f :
2137         (value >= 1e-04) ? spec20f :
2138         (value >= 1e-05) ? spec21f :
2139         (value >= 1e-06) ? spec22f :
2140         (value >= 1e-07) ? spec23f :
2141         (value >= 1e-08) ? spec24f :
2142         (value >= 1e-09) ? spec25f :
2143         (value >= 1e-10) ? spec26f :
2144         (value >= 1e-11) ? spec27f :
2145         (value >= 1e-12) ? spec28f : spec17g;
2146 
2147     outputStream.formatValue(value, formatSpec);
2148 }
2149 
2150 @safe unittest
2151 {
2152     void testFormatValue(double value, string expected)
2153     {
2154         import std.array : appender;
2155 
2156         auto s = appender!string();
2157         s.formatRandomValue(value);
2158         assert(s.data == expected,
2159                format("[testFormatValue] value: %g; expected: %s; actual: %s", value, expected, s.data));
2160     }
2161 
2162     testFormatValue(1.0,   "1.00000000000000000");
2163     testFormatValue(0.1,   "0.10000000000000001");
2164     testFormatValue(0.01,  "0.010000000000000000");
2165     testFormatValue(1e-03, "0.0010000000000000000");
2166     testFormatValue(1e-04, "0.00010000000000000000");
2167     testFormatValue(1e-05, "0.000010000000000000001");
2168     testFormatValue(1e-06, "0.0000010000000000000000");
2169     testFormatValue(1e-07, "0.00000010000000000000000");
2170     testFormatValue(1e-08, "0.000000010000000000000000");
2171     testFormatValue(1e-09, "0.0000000010000000000000001");
2172     testFormatValue(1e-10, "0.00000000010000000000000000");
2173     testFormatValue(1e-11, "0.000000000009999999999999999");
2174     testFormatValue(1e-12, "0.0000000000010000000000000000");
2175     testFormatValue(1e-13, "1e-13");
2176     testFormatValue(1e-14, "1e-14");
2177     testFormatValue(12345678901234567e-15, "12.34567890123456735");
2178     testFormatValue(12345678901234567e-16, "1.23456789012345669");
2179     testFormatValue(12345678901234567e-17, "0.12345678901234566");
2180     testFormatValue(12345678901234567e-18, "0.012345678901234567");
2181     testFormatValue(12345678901234567e-19, "0.0012345678901234567");
2182     testFormatValue(12345678901234567e-20, "0.00012345678901234567");
2183     testFormatValue(12345678901234567e-21, "0.000012345678901234568");
2184     testFormatValue(12345678901234567e-22, "0.0000012345678901234567");
2185     testFormatValue(12345678901234567e-23, "0.00000012345678901234566");
2186     testFormatValue(12345678901234567e-24, "0.000000012345678901234567");
2187     testFormatValue(12345678901234567e-25, "0.0000000012345678901234566");
2188     testFormatValue(12345678901234567e-26, "0.00000000012345678901234568");
2189     testFormatValue(12345678901234567e-27, "0.000000000012345678901234567");
2190     testFormatValue(12345678901234567e-28, "0.0000000000012345678901234567");
2191     testFormatValue(12345678901234567e-29, "1.2345678901234566e-13");
2192 }
2193 
2194 
2195 /** Convenience function for extracting a single field from a line. See
2196  * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error
2197  * text tailored for this program.
2198  */
2199 import std.traits : isSomeChar;
2200 T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, ulong lineNum) pure @safe
2201 if (isSomeChar!C)
2202 {
2203     import std.conv : ConvException, to;
2204     import tsv_utils.common.utils : getTsvFieldValue;
2205 
2206     T val;
2207     try
2208     {
2209         val = getTsvFieldValue!T(line, fieldIndex, delim);
2210     }
2211     catch (ConvException exc)
2212     {
2213         throw new Exception(
2214             format("Could not process line: %s\n  File: %s Line: %s%s",
2215                    exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum,
2216                    (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : ""));
2217     }
2218     catch (Exception exc)
2219     {
2220         /* Not enough fields on the line. */
2221         throw new Exception(
2222             format("Could not process line: %s\n  File: %s Line: %s",
2223                    exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum));
2224     }
2225 
2226     return val;
2227 }
2228 
2229 @safe unittest
2230 {
    /* getFieldValue unit tests. getTsvFieldValue has its own tests.
2232      * These tests make basic sanity checks on the getFieldValue wrapper.
2233      */
2234     import std.exception;
2235 
2236     assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
2237     assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);
2238     assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 1));
2239     assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 2));
2240     assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 1));
2241     assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 2));
2242 }
2243 
2244 /* Unit tests for the main program start here.
2245  *
2246  * Portability note: Many of the tests here rely on generating consistent random numbers
2247  * across different platforms when using the same random seed. So far this has succeeded
2248  * on several different platform, compiler, and library versions. However, it is certainly
2249  * possible this condition will not hold on other platforms.
2250  *
2251  * For tsv-sample, this portability implies generating the same results on different
 * platforms when using the same random seed. This is NOT part of tsv-sample's guarantees,
 * but it is convenient for testing. If platforms are identified that do not generate
 * the same results, these tests will need to be adjusted.
2255  */
2256 version(unittest)
2257 {
2258     /* Unit test helper functions. */
2259 
2260     import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
2261     import std.conv : to;
2262 
2263     void testTsvSample(string[] cmdArgs, string[][] expected)
2264     {
2265         import std.array : appender;
2266 
2267         assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");
2268 
2269         auto formatAssertMessage(T...)(string msg, T formatArgs)
2270         {
2271             auto formatString = "[testTsvSample] %s: " ~ msg;
2272             return format(formatString, cmdArgs[0], formatArgs);
2273         }
2274 
2275         TsvSampleOptions cmdopt;
2276         auto savedCmdArgs = cmdArgs.to!string;
2277         auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command line args: '%s'.", savedCmdArgs));
2279         auto output = appender!(char[])();
2280 
2281         tsvSample(cmdopt, output);    // This invokes the main code line.
2282 
2283         auto expectedOutput = expected.tsvDataToString;
2284 
2285         assert(output.data == expectedOutput,
2286                formatAssertMessage(
2287                    "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
2288                    expectedOutput.to!string, output.data.to!string));
2289     }
2290  }
2291 
2292 unittest
2293 {
2294     import std.path : buildPath;
2295     import std.file : rmdirRecurse;
2296 
2297     auto testDir = makeUnittestTempDir("tsv_sample");
2298     scope(exit) testDir.rmdirRecurse;
2299 
2300     /* Tabular data sets and expected results use the built-in static seed.
2301      * Tests are run by writing the data set to a file, then calling the main
2302      * routine to process. The function testTsvSample plays the role of the
2303      * main program. Rather than writing to expected output, the results are
2304      * matched against expected. The expected results were verified by hand
2305      * prior to inclusion in the test.
2306      *
2307      * The initial part of this section is simply setting up data files and
2308      * expected results.
2309      *
2310      * Expected results naming conventions:
2311      *  - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected
2312      *  - Sampling Type (required): Permute (Shuffle), Sample, Replace, Bernoulli, Distinct
2313      *  - Compatibility: Compat, AlgoR, Skip, Swap, Inorder
2314      *  - Weight Field: Wt<num>, e.g. Wt3
     *  - Sample Size: Num<num>, e.g. Num3
     *  - Seed Value: V<num>, e.g. V77
     *  - Key Field: K<num>, e.g. K2
     *  - Probability: P<num>, e.g. P05 (5%)
2319      *  - Printing Probabilities: Probs
2320      *  - Printing Probs in order: ProbsInorder
2321      *  - Printing Probs with custom header: RVCustom
2322      */
2323 
2324     /* Empty file. */
2325     string[][] dataEmpty = [];
2326     string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
2327     writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
2328 
2329     /* 3x0, header only. */
2330     string[][] data3x0 = [["field_a", "field_b", "field_c"]];
2331     string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
2332     writeUnittestTsvFile(fpath_data3x0, data3x0);
2333 
2334     /* 3x1 */
2335     string[][] data3x1 =
2336         [["field_a", "field_b", "field_c"],
2337          ["tan", "タン", "8.5"]];
2338 
2339     string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
2340     string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
2341     writeUnittestTsvFile(fpath_data3x1, data3x1);
2342     writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1 .. $]);
2343 
2344     string[][] data3x1ExpectedReplaceNum3 =
2345         [["field_a", "field_b", "field_c"],
2346          ["tan", "タン", "8.5"],
2347          ["tan", "タン", "8.5"],
2348          ["tan", "タン", "8.5"]];
2349 
2350     /* 3x2 */
2351     string[][] data3x2 =
2352         [["field_a", "field_b", "field_c"],
2353          ["brown", "褐色", "29.2"],
2354          ["gray", "グレー", "6.2"]];
2355 
2356     string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
2357     string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
2358     writeUnittestTsvFile(fpath_data3x2, data3x2);
2359     writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1 .. $]);
2360 
2361     string[][] data3x2PermuteCompat =
2362         [["field_a", "field_b", "field_c"],
2363          ["gray", "グレー", "6.2"],
2364          ["brown", "褐色", "29.2"]];
2365 
2366     string[][] data3x2PermuteShuffle =
2367         [["field_a", "field_b", "field_c"],
2368          ["gray", "グレー", "6.2"],
2369          ["brown", "褐色", "29.2"]];
2370 
2371     /* 3x3 */
2372     string[][] data3x3 =
2373         [["field_a", "field_b", "field_c"],
2374          ["orange", "オレンジ", "2.5"],
2375          ["pink", "ピンク", "1.1"],
2376          ["purple", "紫の", "42"]];
2377 
2378     string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
2379     string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
2380     writeUnittestTsvFile(fpath_data3x3, data3x3);
2381     writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1 .. $]);
2382 
2383     string[][] data3x3ExpectedPermuteCompat =
2384         [["field_a", "field_b", "field_c"],
2385          ["purple", "紫の", "42"],
2386          ["pink", "ピンク", "1.1"],
2387          ["orange", "オレンジ", "2.5"]];
2388 
2389     string[][] data3x3ExpectedPermuteSwap =
2390         [["field_a", "field_b", "field_c"],
2391          ["purple", "紫の", "42"],
2392          ["orange", "オレンジ", "2.5"],
2393          ["pink", "ピンク", "1.1"]];
2394 
2395     /* 3x6 */
2396     string[][] data3x6 =
2397         [["field_a", "field_b", "field_c"],
2398          ["red", "赤", "23.8"],
2399          ["green", "緑", "0.0072"],
2400          ["white", "白", "1.65"],
2401          ["yellow", "黄", "12"],
2402          ["blue", "青", "12"],
2403          ["black", "黒", "0.983"]];
2404     string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
2405     string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
2406     writeUnittestTsvFile(fpath_data3x6, data3x6);
2407     writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1 .. $]);
2408 
2409     // Randomization, all lines
2410     string[][] data3x6ExpectedPermuteCompat =
2411         [["field_a", "field_b", "field_c"],
2412          ["yellow", "黄", "12"],
2413          ["black", "黒", "0.983"],
2414          ["blue", "青", "12"],
2415          ["white", "白", "1.65"],
2416          ["green", "緑", "0.0072"],
2417          ["red", "赤", "23.8"]];
2418 
2419     string[][] data3x6ExpectedPermuteSwap =
2420         [["field_a", "field_b", "field_c"],
2421          ["black", "黒", "0.983"],
2422          ["green", "緑", "0.0072"],
2423          ["red", "赤", "23.8"],
2424          ["yellow", "黄", "12"],
2425          ["white", "白", "1.65"],
2426          ["blue", "青", "12"]];
2427 
2428     string[][] data3x6ExpectedPermuteCompatProbs =
2429         [["random_value", "field_a", "field_b", "field_c"],
2430          ["0.96055546286515892", "yellow", "黄", "12"],
2431          ["0.75710153928957880", "black", "黒", "0.983"],
2432          ["0.52525980887003243", "blue", "青", "12"],
2433          ["0.49287854949943721", "white", "白", "1.65"],
2434          ["0.15929344086907804", "green", "緑", "0.0072"],
2435          ["0.010968807619065046", "red", "赤", "23.8"]];
2436 
    /* Note: data3x6ExpectedSampleAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because
     * both are effectively the same algorithm when --num equals the data length. Both read
     * in the full data in order, then call randomShuffle.
2440      */
2441     string[][] data3x6ExpectedSampleAlgoRNum6 =
2442         [["field_a", "field_b", "field_c"],
2443          ["black", "黒", "0.983"],
2444          ["green", "緑", "0.0072"],
2445          ["red", "赤", "23.8"],
2446          ["yellow", "黄", "12"],
2447          ["white", "白", "1.65"],
2448          ["blue", "青", "12"]];
2449 
2450     string[][] data3x6ExpectedSampleAlgoRNum5 =
2451         [["field_a", "field_b", "field_c"],
2452          ["red", "赤", "23.8"],
2453          ["black", "黒", "0.983"],
2454          ["white", "白", "1.65"],
2455          ["green", "緑", "0.0072"],
2456          ["yellow", "黄", "12"]];
2457 
2458     string[][] data3x6ExpectedSampleAlgoRNum4 =
2459         [["field_a", "field_b", "field_c"],
2460          ["blue", "青", "12"],
2461          ["green", "緑", "0.0072"],
2462          ["black", "黒", "0.983"],
2463          ["white", "白", "1.65"]];
2464 
2465     string[][] data3x6ExpectedSampleAlgoRNum3 =
2466         [["field_a", "field_b", "field_c"],
2467          ["red", "赤", "23.8"],
2468          ["black", "黒", "0.983"],
2469          ["green", "緑", "0.0072"]];
2470 
2471     string[][] data3x6ExpectedSampleAlgoRNum2 =
2472         [["field_a", "field_b", "field_c"],
2473          ["black", "黒", "0.983"],
2474          ["red", "赤", "23.8"]];
2475 
2476     string[][] data3x6ExpectedSampleAlgoRNum1 =
2477         [["field_a", "field_b", "field_c"],
2478          ["green", "緑", "0.0072"]];
2479 
2480     /* Inorder versions. */
2481     string[][] data3x6ExpectedSampleAlgoRNum6Inorder =
2482         [["field_a", "field_b", "field_c"],
2483          ["red", "赤", "23.8"],
2484          ["green", "緑", "0.0072"],
2485          ["white", "白", "1.65"],
2486          ["yellow", "黄", "12"],
2487          ["blue", "青", "12"],
2488          ["black", "黒", "0.983"]];
2489 
2490     string[][] data3x6ExpectedSampleAlgoRNum5Inorder =
2491         [["field_a", "field_b", "field_c"],
2492          ["red", "赤", "23.8"],
2493          ["green", "緑", "0.0072"],
2494          ["white", "白", "1.65"],
2495          ["yellow", "黄", "12"],
2496          ["black", "黒", "0.983"]];
2497 
2498     string[][] data3x6ExpectedSampleAlgoRNum4Inorder =
2499         [["field_a", "field_b", "field_c"],
2500          ["green", "緑", "0.0072"],
2501          ["white", "白", "1.65"],
2502          ["blue", "青", "12"],
2503          ["black", "黒", "0.983"]];
2504 
2505     string[][] data3x6ExpectedSampleAlgoRNum3Inorder =
2506         [["field_a", "field_b", "field_c"],
2507          ["red", "赤", "23.8"],
2508          ["green", "緑", "0.0072"],
2509          ["black", "黒", "0.983"]];
2510 
2511     string[][] data3x6ExpectedSampleAlgoRNum2Inorder =
2512         [["field_a", "field_b", "field_c"],
2513          ["red", "赤", "23.8"],
2514          ["black", "黒", "0.983"]];
2515 
2516     string[][] data3x6ExpectedSampleAlgoRNum1Inorder =
2517         [["field_a", "field_b", "field_c"],
2518          ["green", "緑", "0.0072"]];
2519 
2520     /* Reservoir inorder */
2521     string[][] data3x6ExpectedSampleCompatNum6Inorder =
2522         [["field_a", "field_b", "field_c"],
2523          ["red", "赤", "23.8"],
2524          ["green", "緑", "0.0072"],
2525          ["white", "白", "1.65"],
2526          ["yellow", "黄", "12"],
2527          ["blue", "青", "12"],
2528          ["black", "黒", "0.983"]];
2529 
2530     string[][] data3x6ExpectedSampleCompatNum5Inorder =
2531         [["field_a", "field_b", "field_c"],
2532          ["green", "緑", "0.0072"],
2533          ["white", "白", "1.65"],
2534          ["yellow", "黄", "12"],
2535          ["blue", "青", "12"],
2536          ["black", "黒", "0.983"]];
2537 
2538     string[][] data3x6ExpectedSampleCompatNum4Inorder =
2539         [["field_a", "field_b", "field_c"],
2540          ["white", "白", "1.65"],
2541          ["yellow", "黄", "12"],
2542          ["blue", "青", "12"],
2543          ["black", "黒", "0.983"]];
2544 
2545     string[][] data3x6ExpectedSampleCompatNum3Inorder =
2546         [["field_a", "field_b", "field_c"],
2547          ["yellow", "黄", "12"],
2548          ["blue", "青", "12"],
2549          ["black", "黒", "0.983"]];
2550 
2551     string[][] data3x6ExpectedSampleCompatNum2Inorder =
2552         [["field_a", "field_b", "field_c"],
2553          ["yellow", "黄", "12"],
2554          ["black", "黒", "0.983"]];
2555 
2556     string[][] data3x6ExpectedSampleCompatNum1Inorder =
2557         [["field_a", "field_b", "field_c"],
2558          ["yellow", "黄", "12"]];
2559 
2560 
2561     /* Reservoir inorder with probabilities. */
2562     string[][] data3x6ExpectedSampleCompatNum6ProbsInorder =
2563         [["random_value", "field_a", "field_b", "field_c"],
2564          ["0.010968807619065046", "red", "赤", "23.8"],
2565          ["0.15929344086907804", "green", "緑", "0.0072"],
2566          ["0.49287854949943721", "white", "白", "1.65"],
2567          ["0.96055546286515892", "yellow", "黄", "12"],
2568          ["0.52525980887003243", "blue", "青", "12"],
2569          ["0.75710153928957880", "black", "黒", "0.983"]];
2570 
2571     string[][] data3x6ExpectedSampleCompatNum5ProbsInorder =
2572         [["random_value", "field_a", "field_b", "field_c"],
2573          ["0.15929344086907804", "green", "緑", "0.0072"],
2574          ["0.49287854949943721", "white", "白", "1.65"],
2575          ["0.96055546286515892", "yellow", "黄", "12"],
2576          ["0.52525980887003243", "blue", "青", "12"],
2577          ["0.75710153928957880", "black", "黒", "0.983"]];
2578 
2579     string[][] data3x6ExpectedSampleCompatNum4ProbsInorder =
2580         [["random_value", "field_a", "field_b", "field_c"],
2581          ["0.49287854949943721", "white", "白", "1.65"],
2582          ["0.96055546286515892", "yellow", "黄", "12"],
2583          ["0.52525980887003243", "blue", "青", "12"],
2584          ["0.75710153928957880", "black", "黒", "0.983"]];
2585 
2586     string[][] data3x6ExpectedSampleCompatNum3ProbsInorder =
2587         [["random_value", "field_a", "field_b", "field_c"],
2588          ["0.96055546286515892", "yellow", "黄", "12"],
2589          ["0.52525980887003243", "blue", "青", "12"],
2590          ["0.75710153928957880", "black", "黒", "0.983"]];
2591 
2592     string[][] data3x6ExpectedSampleCompatNum2ProbsInorder =
2593         [["random_value", "field_a", "field_b", "field_c"],
2594          ["0.96055546286515892", "yellow", "黄", "12"],
2595          ["0.75710153928957880", "black", "黒", "0.983"]];
2596 
2597     string[][] data3x6ExpectedSampleCompatNum1ProbsInorder =
2598         [["random_value", "field_a", "field_b", "field_c"],
2599          ["0.96055546286515892", "yellow", "黄", "12"]];
2600 
2601     string[][] data3x6ExpectedWt3Num6Inorder =
2602         [["field_a", "field_b", "field_c"],
2603          ["red", "赤", "23.8"],
2604          ["green", "緑", "0.0072"],
2605          ["white", "白", "1.65"],
2606          ["yellow", "黄", "12"],
2607          ["blue", "青", "12"],
2608          ["black", "黒", "0.983"]];
2609 
2610     string[][] data3x6ExpectedWt3Num5Inorder =
2611         [["field_a", "field_b", "field_c"],
2612          ["green", "緑", "0.0072"],
2613          ["white", "白", "1.65"],
2614          ["yellow", "黄", "12"],
2615          ["blue", "青", "12"],
2616          ["black", "黒", "0.983"]];
2617 
2618     string[][] data3x6ExpectedWt3Num4Inorder =
2619         [["field_a", "field_b", "field_c"],
2620          ["white", "白", "1.65"],
2621          ["yellow", "黄", "12"],
2622          ["blue", "青", "12"],
2623          ["black", "黒", "0.983"]];
2624 
2625     string[][] data3x6ExpectedWt3Num3Inorder =
2626         [["field_a", "field_b", "field_c"],
2627          ["yellow", "黄", "12"],
2628          ["blue", "青", "12"],
2629          ["black", "黒", "0.983"]];
2630 
2631     string[][] data3x6ExpectedWt3Num2Inorder =
2632         [["field_a", "field_b", "field_c"],
2633          ["yellow", "黄", "12"],
2634          ["black", "黒", "0.983"]];
2635 
2636     string[][] data3x6ExpectedWt3Num1Inorder =
2637         [["field_a", "field_b", "field_c"],
2638          ["yellow", "黄", "12"]];
2639 
2640 
2641     string[][] data3x6ExpectedBernoulliProbsP100 =
2642         [["random_value", "field_a", "field_b", "field_c"],
2643          ["0.010968807619065046", "red", "赤", "23.8"],
2644          ["0.15929344086907804", "green", "緑", "0.0072"],
2645          ["0.49287854949943721", "white", "白", "1.65"],
2646          ["0.96055546286515892", "yellow", "黄", "12"],
2647          ["0.52525980887003243", "blue", "青", "12"],
2648          ["0.75710153928957880", "black", "黒", "0.983"]];
2649 
2650     string[][] data3x6ExpectedBernoulliCompatProbsP60 =
2651         [["random_value", "field_a", "field_b", "field_c"],
2652          ["0.010968807619065046", "red", "赤", "23.8"],
2653          ["0.15929344086907804", "green", "緑", "0.0072"],
2654          ["0.49287854949943721", "white", "白", "1.65"],
2655          ["0.52525980887003243", "blue", "青", "12"]];
2656 
2657     string[][] data3x6ExpectedBernoulliSkipP40 =
2658         [["field_a", "field_b", "field_c"],
2659          ["red", "赤", "23.8"],
2660          ["green", "緑", "0.0072"],
2661          ["yellow", "黄", "12"]];
2662 
2663     string[][] data3x6ExpectedBernoulliCompatP60 =
2664         [["field_a", "field_b", "field_c"],
2665          ["red", "赤", "23.8"],
2666          ["green", "緑", "0.0072"],
2667          ["white", "白", "1.65"],
2668          ["blue", "青", "12"]];
2669 
2670     string[][] data3x6ExpectedDistinctK1K3P60 =
2671         [["field_a", "field_b", "field_c"],
2672          ["green", "緑", "0.0072"],
2673          ["white", "白", "1.65"],
2674          ["blue", "青", "12"]];
2675 
2676     string[][] data3x6ExpectedDistinctK1K3P60Probs =
2677         [["random_value", "field_a", "field_b", "field_c"],
2678          ["0", "green", "緑", "0.0072"],
2679          ["0", "white", "白", "1.65"],
2680          ["0", "blue", "青", "12"]];
2681 
2682     string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom =
2683         [["custom_random_value_header", "field_a", "field_b", "field_c"],
2684          ["0", "green", "緑", "0.0072"],
2685          ["0", "white", "白", "1.65"],
2686          ["0", "blue", "青", "12"]];
2687 
2688     string[][] data3x6ExpectedDistinctK2P2ProbsInorder =
2689         [["random_value", "field_a", "field_b", "field_c"],
2690          ["1", "red", "赤", "23.8"],
2691          ["0", "green", "緑", "0.0072"],
2692          ["0", "white", "白", "1.65"],
2693          ["1", "yellow", "黄", "12"],
2694          ["3", "blue", "青", "12"],
2695          ["2", "black", "黒", "0.983"]];
2696 
2697     string[][] data3x6ExpectedPermuteWt3Probs =
2698         [["random_value", "field_a", "field_b", "field_c"],
2699          ["0.99665198757645390", "yellow", "黄", "12"],
2700          ["0.94775884809836686", "blue", "青", "12"],
2701          ["0.82728234682286661", "red", "赤", "23.8"],
2702          ["0.75346697377181959", "black", "黒", "0.983"],
2703          ["0.65130103496422487", "white", "白", "1.65"],
2704          ["1.5636943712879866e-111", "green", "緑", "0.0072"]];
2705 
2706     string[][] data3x6ExpectedWt3ProbsInorder =
2707         [["random_value", "field_a", "field_b", "field_c"],
2708          ["0.82728234682286661", "red", "赤", "23.8"],
2709          ["1.5636943712879866e-111", "green", "緑", "0.0072"],
2710          ["0.65130103496422487", "white", "白", "1.65"],
2711          ["0.99665198757645390", "yellow", "黄", "12"],
2712          ["0.94775884809836686", "blue", "青", "12"],
2713          ["0.75346697377181959", "black", "黒", "0.983"]];
2714 
2715     string[][] data3x6ExpectedPermuteWt3 =
2716         [["field_a", "field_b", "field_c"],
2717          ["yellow", "黄", "12"],
2718          ["blue", "青", "12"],
2719          ["red", "赤", "23.8"],
2720          ["black", "黒", "0.983"],
2721          ["white", "白", "1.65"],
2722          ["green", "緑", "0.0072"]];
2723 
2724 
2725     string[][] data3x6ExpectedReplaceNum10 =
2726         [["field_a", "field_b", "field_c"],
2727          ["black", "黒", "0.983"],
2728          ["green", "緑", "0.0072"],
2729          ["green", "緑", "0.0072"],
2730          ["red", "赤", "23.8"],
2731          ["yellow", "黄", "12"],
2732          ["red", "赤", "23.8"],
2733          ["white", "白", "1.65"],
2734          ["yellow", "黄", "12"],
2735          ["yellow", "黄", "12"],
2736          ["white", "白", "1.65"],
2737         ];
2738 
2739     string[][] data3x6ExpectedReplaceNum10V77 =
2740         [["field_a", "field_b", "field_c"],
2741          ["black", "黒", "0.983"],
2742          ["red", "赤", "23.8"],
2743          ["black", "黒", "0.983"],
2744          ["yellow", "黄", "12"],
2745          ["green", "緑", "0.0072"],
2746          ["green", "緑", "0.0072"],
2747          ["green", "緑", "0.0072"],
2748          ["yellow", "黄", "12"],
2749          ["blue", "青", "12"],
2750          ["white", "白", "1.65"],
2751         ];
2752 
2753     /* Using a different static seed. */
2754     string[][] data3x6ExpectedPermuteCompatV41Probs =
2755         [["random_value", "field_a", "field_b", "field_c"],
2756          ["0.68057272653095424", "green", "緑", "0.0072"],
2757          ["0.67681624367833138", "blue", "青", "12"],
2758          ["0.32097338931635022", "yellow", "黄", "12"],
2759          ["0.25092361867427826", "red", "赤", "23.8"],
2760          ["0.15535934292711318", "black", "黒", "0.983"],
2761          ["0.046095821075141430", "white", "白", "1.65"]];
2762 
2763     string[][] data3x6ExpectedBernoulliCompatP60V41Probs =
2764         [["random_value", "field_a", "field_b", "field_c"],
2765          ["0.25092361867427826", "red", "赤", "23.8"],
2766          ["0.046095821075141430", "white", "白", "1.65"],
2767          ["0.32097338931635022", "yellow", "黄", "12"],
2768          ["0.15535934292711318", "black", "黒", "0.983"]];
2769 
2770     string[][] data3x6ExpectedPermuteWt3V41Probs =
2771         [["random_value", "field_a", "field_b", "field_c"],
2772          ["0.96799377498910666", "blue", "青", "12"],
2773          ["0.94356245792573568", "red", "赤", "23.8"],
2774          ["0.90964601024271996", "yellow", "黄", "12"],
2775          ["0.15491658409260103", "white", "白", "1.65"],
2776          ["0.15043620392537033", "black", "黒", "0.983"],
2777          ["6.1394674830701461e-24", "green", "緑", "0.0072"]];
2778 
2779     string[][] data3x6ExpectedWt3V41ProbsInorder =
2780         [["random_value", "field_a", "field_b", "field_c"],
2781          ["0.94356245792573568", "red", "赤", "23.8"],
2782          ["6.1394674830701461e-24", "green", "緑", "0.0072"],
2783          ["0.15491658409260103", "white", "白", "1.65"],
2784          ["0.90964601024271996", "yellow", "黄", "12"],
2785          ["0.96799377498910666", "blue", "青", "12"],
2786          ["0.15043620392537033", "black", "黒", "0.983"]];
2787 
2788 
2789     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2790     string[][] combo1ExpectedPermuteCompat =
2791         [["field_a", "field_b", "field_c"],
2792          ["yellow", "黄", "12"],
2793          ["tan", "タン", "8.5"],
2794          ["brown", "褐色", "29.2"],
2795          ["green", "緑", "0.0072"],
2796          ["red", "赤", "23.8"],
2797          ["purple", "紫の", "42"],
2798          ["black", "黒", "0.983"],
2799          ["white", "白", "1.65"],
2800          ["gray", "グレー", "6.2"],
2801          ["blue", "青", "12"],
2802          ["pink", "ピンク", "1.1"],
2803          ["orange", "オレンジ", "2.5"]];
2804 
2805     string[][] combo1ExpectedPermuteCompatProbs =
2806         [["random_value", "field_a", "field_b", "field_c"],
2807          ["0.97088520275428891", "yellow", "黄", "12"],
2808          ["0.96055546286515892", "tan", "タン", "8.5"],
2809          ["0.81756894313730299", "brown", "褐色", "29.2"],
2810          ["0.75710153928957880", "green", "緑", "0.0072"],
2811          ["0.52525980887003243", "red", "赤", "23.8"],
2812          ["0.49287854949943721", "purple", "紫の", "42"],
2813          ["0.47081507067196071", "black", "黒", "0.983"],
2814          ["0.38388182921335101", "white", "白", "1.65"],
2815          ["0.29215990612283349", "gray", "グレー", "6.2"],
2816          ["0.24033216014504433", "blue", "青", "12"],
2817          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2818          ["0.010968807619065046", "orange", "オレンジ", "2.5"]];
2819 
2820     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
2821     string[][] combo1ExpectedProbsInorder =
2822         [["random_value", "field_a", "field_b", "field_c"],
2823          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2824          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2825          ["0.49287854949943721", "purple", "紫の", "42"],
2826          ["0.96055546286515892", "tan", "タン", "8.5"],
2827          ["0.52525980887003243", "red", "赤", "23.8"],
2828          ["0.75710153928957880", "green", "緑", "0.0072"],
2829          ["0.38388182921335101", "white", "白", "1.65"],
2830          ["0.97088520275428891", "yellow", "黄", "12"],
2831          ["0.24033216014504433", "blue", "青", "12"],
2832          ["0.47081507067196071", "black", "黒", "0.983"],
2833          ["0.81756894313730299", "brown", "褐色", "29.2"],
2834          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2835 
2836     string[][] combo1ExpectedBernoulliCompatP50Probs =
2837         [["random_value", "field_a", "field_b", "field_c"],
2838          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
2839          ["0.15929344086907804", "pink", "ピンク", "1.1"],
2840          ["0.49287854949943721", "purple", "紫の", "42"],
2841          ["0.38388182921335101", "white", "白", "1.65"],
2842          ["0.24033216014504433", "blue", "青", "12"],
2843          ["0.47081507067196071", "black", "黒", "0.983"],
2844          ["0.29215990612283349", "gray", "グレー", "6.2"]];
2845 
2846     string[][] combo1ExpectedBernoulliCompatP40 =
2847         [["field_a", "field_b", "field_c"],
2848          ["orange", "オレンジ", "2.5"],
2849          ["pink", "ピンク", "1.1"],
2850          ["white", "白", "1.65"],
2851          ["blue", "青", "12"],
2852          ["gray", "グレー", "6.2"]];
2853 
2854     string[][] combo1ExpectedDistinctK1P40 =
2855         [["field_a", "field_b", "field_c"],
2856          ["orange", "オレンジ", "2.5"],
2857          ["red", "赤", "23.8"],
2858          ["green", "緑", "0.0072"],
2859          ["blue", "青", "12"],
2860          ["black", "黒", "0.983"]];
2861 
2862     string[][] combo1ExpectedPermuteWt3Probs =
2863         [["random_value", "field_a", "field_b", "field_c"],
2864          ["0.99754077523718754", "yellow", "黄", "12"],
2865          ["0.99527665440088786", "tan", "タン", "8.5"],
2866          ["0.99312578945741659", "brown", "褐色", "29.2"],
2867          ["0.98329602553389361", "purple", "紫の", "42"],
2868          ["0.97330961938083660", "red", "赤", "23.8"],
2869          ["0.88797551521739648", "blue", "青", "12"],
2870          ["0.81999230489041786", "gray", "グレー", "6.2"],
2871          ["0.55975569204250941", "white", "白", "1.65"],
2872          ["0.46472135609205739", "black", "黒", "0.983"],
2873          ["0.18824582704191337", "pink", "ピンク", "1.1"],
2874          ["0.16446131853299920", "orange", "オレンジ", "2.5"],
2875          ["1.6438086931020549e-17", "green", "緑", "0.0072"]];
2876 
2877     string[][] combo1ExpectedPermuteWt3 =
2878         [["field_a", "field_b", "field_c"],
2879          ["yellow", "黄", "12"],
2880          ["tan", "タン", "8.5"],
2881          ["brown", "褐色", "29.2"],
2882          ["purple", "紫の", "42"],
2883          ["red", "赤", "23.8"],
2884          ["blue", "青", "12"],
2885          ["gray", "グレー", "6.2"],
2886          ["white", "白", "1.65"],
2887          ["black", "黒", "0.983"],
2888          ["pink", "ピンク", "1.1"],
2889          ["orange", "オレンジ", "2.5"],
2890          ["green", "緑", "0.0072"]];
2891 
2892     string[][] combo1ExpectedSampleAlgoRNum4 =
2893         [["field_a", "field_b", "field_c"],
2894          ["blue", "青", "12"],
2895          ["gray", "グレー", "6.2"],
2896          ["brown", "褐色", "29.2"],
2897          ["white", "白", "1.65"]];
2898 
2899     string[][] combo1ExpectedSampleAlgoRNum4Inorder =
2900         [["field_a", "field_b", "field_c"],
2901          ["white", "白", "1.65"],
2902          ["blue", "青", "12"],
2903          ["brown", "褐色", "29.2"],
2904          ["gray", "グレー", "6.2"]];
2905 
2906     string[][] combo1ExpectedReplaceNum10 =
2907         [["field_a", "field_b", "field_c"],
2908          ["gray", "グレー", "6.2"],
2909          ["yellow", "黄", "12"],
2910          ["yellow", "黄", "12"],
2911          ["white", "白", "1.65"],
2912          ["tan", "タン", "8.5"],
2913          ["white", "白", "1.65"],
2914          ["blue", "青", "12"],
2915          ["black", "黒", "0.983"],
2916          ["tan", "タン", "8.5"],
2917          ["purple", "紫の", "42"]];
2918 
2919     /* 1x200 - Needed for testing bernoulliSkipSampling, which is used when prob < 0.04; see the sketch below. */
2920     string[][] data1x200 =
2921         [["field_a"],
2922          ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"],
2923          ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"],
2924          ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"],
2925          ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"],
2926          ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"],
2927          ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"],
2928          ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"],
2929          ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"],
2930          ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"],
2931          ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"],
2932          ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"],
2933          ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"],
2934          ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"],
2935          ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"],
2936          ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"],
2937          ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"],
2938          ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"],
2939          ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"],
2940          ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"],
2941          ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"],
2942         ];
2943 
2944     string fpath_data1x200 = buildPath(testDir, "data1x200.tsv");
2945     string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv");
2946     writeUnittestTsvFile(fpath_data1x200, data1x200);
2947     writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1 .. $]);
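    /* Illustrative sketch only (not tsv-sample's implementation): Bernoulli skip sampling
     * avoids a per-line random draw by drawing the gap to the next selected line from a
     * geometric distribution, which pays off only at small inclusion probabilities such
     * as those tested with data1x200. The helper below is a hypothetical example.
     */
    {
        import std.math : log, trunc;
        import std.random : Random, uniform01;

        size_t nextSkip(ref Random rng, double p)
        {
            // Number of lines to skip before the next selected line (geometric gap).
            return cast(size_t) trunc(log(1.0 - uniform01(rng)) / log(1.0 - p));
        }

        auto sketchRNG = Random(333);                   // Arbitrary seed for the sketch.
        const gap = nextSkip(sketchRNG, 0.03);          // e.g. skip 'gap' lines, select the next one.
    }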
2948 
2949     string[][] data1x200ExpectedBernoulliSkipV333P01 =
2950         [["field_a"],
2951          ["077"],
2952          ["119"]];
2953 
2954     string[][] data1x200ExpectedBernoulliSkipV333P02 =
2955         [["field_a"],
2956          ["038"],
2957          ["059"],
2958          ["124"],
2959          ["161"],
2960          ["162"],
2961          ["183"]];
2962 
2963     string[][] data1x200ExpectedBernoulliSkipV333P03 =
2964         [["field_a"],
2965          ["025"],
2966          ["039"],
2967          ["082"],
2968          ["107"],
2969          ["108"],
2970          ["122"],
2971          ["136"],
2972          ["166"],
2973          ["182"]];
2974 
2975     string[][] data1x200ExpectedBernoulliCompatV333P01 =
2976         [["field_a"],
2977          ["072"]];
2978 
2979     string[][] data1x200ExpectedBernoulliCompatV333P02 =
2980         [["field_a"],
2981          ["004"],
2982          ["072"]];
2983 
2984     string[][] data1x200ExpectedBernoulliCompatV333P03 =
2985         [["field_a"],
2986          ["004"],
2987          ["072"],
2988          ["181"]];
2989 
2990     /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files,
2991      * only expected results. The header comes from 3x0. The results are offset by one position
2992      * from data1x200ExpectedBernoulliSkipV333P03 because a single line precedes the 1x200 data;
2993      * the sanity check after the array verifies this. */
2994     string[][] combo2ExpectedBernoulliSkipV333P03 =
2995         [["field_a", "field_b", "field_c"],
2996          ["024"],
2997          ["038"],
2998          ["081"],
2999          ["106"],
3000          ["107"],
3001          ["121"],
3002          ["135"],
3003          ["165"],
3004          ["181"]];
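    /* Sanity check added for illustration: each selected value in combo2 should be one less
     * than the corresponding value in data1x200ExpectedBernoulliSkipV333P03, per the offset
     * described above.
     */
    {
        import std.conv : to;
        assert(combo2ExpectedBernoulliSkipV333P03.length ==
               data1x200ExpectedBernoulliSkipV333P03.length);
        foreach (i; 1 .. combo2ExpectedBernoulliSkipV333P03.length)
        {
            assert(combo2ExpectedBernoulliSkipV333P03[i][0].to!int + 1 ==
                   data1x200ExpectedBernoulliSkipV333P03[i][0].to!int);
        }
    }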
3005 
3006 
3007     /* 1x10 - Simple 1-column file. */
3008     string[][] data1x10 =
3009         [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
3010     string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
3011     string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
3012     writeUnittestTsvFile(fpath_data1x10, data1x10);
3013     writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1 .. $]);
3014 
3015     string[][] data1x10ExpectedPermuteCompat =
3016         [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
3017 
3018     string[][] data1x10ExpectedPermuteWt1 =
3019         [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]];
3020 
3021     /* 2x10a - Uniform distribution [0,1]. */
3022     string[][] data2x10a =
3023         [["line", "weight"],
3024          ["1", "0.26788837"],
3025          ["2", "0.06601298"],
3026          ["3", "0.38627527"],
3027          ["4", "0.47379424"],
3028          ["5", "0.02966641"],
3029          ["6", "0.05636231"],
3030          ["7", "0.70529242"],
3031          ["8", "0.91836862"],
3032          ["9", "0.99103720"],
3033          ["10", "0.31401740"]];
3034 
3035     string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv");
3036     writeUnittestTsvFile(fpath_data2x10a, data2x10a);
3037 
3038     string[][] data2x10aExpectedPermuteWt2Probs =
3039         [["random_value", "line", "weight"],
3040          ["0.96833865494543658", "8", "0.91836862"],
3041          ["0.91856842054413923", "4", "0.47379424"],
3042          ["0.25730832087795091", "7", "0.70529242"],
3043          ["0.23725317907018120", "9", "0.99103720"],
3044          ["0.16016096701872204", "3", "0.38627527"],
3045          ["0.090819662667243381", "10", "0.31401740"],
3046          ["0.0071764539244361172", "6", "0.05636231"],
3047          ["0.000000048318642951630057", "1", "0.26788837"],
3048          ["0.00000000037525692966535517", "5", "0.02966641"],
3049          ["8.2123247880095796e-13", "2", "0.06601298"]];
3050 
3051     /* 2x10b - Uniform distribution [0,1000]. */
3052     string[][] data2x10b =
3053         [["line", "weight"],
3054          ["1", "761"],
3055          ["2", "432"],
3056          ["3", "103"],
3057          ["4", "448"],
3058          ["5", "750"],
3059          ["6", "711"],
3060          ["7", "867"],
3061          ["8", "841"],
3062          ["9", "963"],
3063          ["10", "784"]];
3064 
3065     string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv");
3066     writeUnittestTsvFile(fpath_data2x10b, data2x10b);
3067 
3068     string[][] data2x10bExpectedPermuteWt2Probs =
3069         [["random_value", "line", "weight"],
3070          ["0.99996486739067969", "8", "841"],
3071          ["0.99991017467137211", "4", "448"],
3072          ["0.99960871524873662", "6", "711"],
3073          ["0.99914188537143800", "5", "750"],
3074          ["0.99903963250274785", "10", "784"],
3075          ["0.99889631825931946", "7", "867"],
3076          ["0.99852058315191139", "9", "963"],
3077          ["0.99575669679158918", "2", "432"],
3078          ["0.99408758732050595", "1", "761"],
3079          ["0.99315467761212362", "3", "103"]];
3080 
3081     /* 2x10c - Logarithmic distribution in random order. */
3082     string[][] data2x10c =
3083         [["line", "weight"],
3084          ["1", "31.85"],
3085          ["2", "17403.31"],
3086          ["3", "653.84"],
3087          ["4", "8.23"],
3088          ["5", "2671.04"],
3089          ["6", "26226.08"],
3090          ["7", "1.79"],
3091          ["8", "354.56"],
3092          ["9", "35213.81"],
3093          ["10", "679.29"]];
3094 
3095     string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv");
3096     writeUnittestTsvFile(fpath_data2x10c, data2x10c);
3097 
3098     string[][] data2x10cExpectedPermuteWt2Probs =
3099         [["random_value", "line", "weight"],
3100          ["0.99998939008709697", "6", "26226.08"],
3101          ["0.99995951291695517", "9", "35213.81"],
3102          ["0.99991666907613541", "8", "354.56"],
3103          ["0.99989445052186410", "2", "17403.31"],
3104          ["0.99975897602861630", "5", "2671.04"],
3105          ["0.99891852769877643", "3", "653.84"],
3106          ["0.99889167752782515", "10", "679.29"],
3107          ["0.99512207506850148", "4", "8.23"],
3108          ["0.86789371584259023", "1", "31.85"],
3109          ["0.58574438162915610", "7", "1.79"]];
3110 
3111     /* 2x10d - Logarithmic distribution in ascending order. */
3112     string[][] data2x10d =
3113         [["line", "weight"],
3114          ["1", "1.79"],
3115          ["2", "8.23"],
3116          ["3", "31.85"],
3117          ["4", "354.56"],
3118          ["5", "653.84"],
3119          ["6", "679.29"],
3120          ["7", "2671.04"],
3121          ["8", "17403.31"],
3122          ["9", "26226.08"],
3123          ["10", "35213.81"]];
3124 
3125     string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv");
3126     writeUnittestTsvFile(fpath_data2x10d, data2x10d);
3127 
3128     string[][] data2x10dExpectedPermuteWt2Probs =
3129         [["random_value", "line", "weight"],
3130          ["0.99999830221846353", "8", "17403.31"],
3131          ["0.99997860834041397", "10", "35213.81"],
3132          ["0.99994563828986716", "9", "26226.08"],
3133          ["0.99988650363575737", "4", "354.56"],
3134          ["0.99964161939190088", "7", "2671.04"],
3135          ["0.99959045338948649", "6", "679.29"],
3136          ["0.99901574490639788", "5", "653.84"],
3137          ["0.97803163304747431", "3", "31.85"],
3138          ["0.79994791806910948", "2", "8.23"],
3139          ["0.080374261239949119", "1", "1.79"]];
3140 
3141     /* 2x10e - Logarithmic distribution in descending order. */
3142     string[][] data2x10e =
3143         [["line", "weight"],
3144          ["1", "35213.81"],
3145          ["2", "26226.08"],
3146          ["3", "17403.31"],
3147          ["4", "2671.04"],
3148          ["5", "679.29"],
3149          ["6", "653.84"],
3150          ["7", "354.56"],
3151          ["8", "31.85"],
3152          ["9", "8.23"],
3153          ["10", "1.79"]];
3154     string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
3155     writeUnittestTsvFile(fpath_data2x10e, data2x10e);
3156 
3157     string[][] data2x10eExpectedPermuteWt2Probs =
3158         [["random_value", "line", "weight"],
3159          ["0.99998493348975237", "4", "2671.04"],
3160          ["0.99995934807202624", "3", "17403.31"],
3161          ["0.99992995739727453", "2", "26226.08"],
3162          ["0.99987185679245649", "1", "35213.81"],
3163          ["0.99957451563173938", "6", "653.84"],
3164          ["0.99907273650209583", "8", "31.85"],
3165          ["0.99905260312968946", "5", "679.29"],
3166          ["0.99730333650516401", "7", "354.56"],
3167          ["0.84093902435227808", "9", "8.23"],
3168          ["0.65650015926290028", "10", "1.79"]];
3169 
3170     /* Data sets for distinct sampling. */
3171     string[][] data5x25 =
3172         [["ID", "Shape", "Color", "Size", "Weight"],
3173          ["01", "circle", "red", "S", "10"],
3174          ["02", "circle", "black", "L", "20"],
3175          ["03", "square", "black", "L", "20"],
3176          ["04", "circle", "green", "L", "30"],
3177          ["05", "ellipse", "red", "S", "20"],
3178          ["06", "triangle", "red", "S", "10"],
3179          ["07", "triangle", "red", "L", "20"],
3180          ["08", "square", "black", "S", "10"],
3181          ["09", "circle", "black", "S", "20"],
3182          ["10", "square", "green", "L", "20"],
3183          ["11", "triangle", "red", "L", "20"],
3184          ["12", "circle", "green", "L", "30"],
3185          ["13", "ellipse", "red", "S", "20"],
3186          ["14", "circle", "green", "L", "30"],
3187          ["15", "ellipse", "red", "L", "30"],
3188          ["16", "square", "red", "S", "10"],
3189          ["17", "circle", "black", "L", "20"],
3190          ["18", "square", "red", "S", "20"],
3191          ["19", "square", "black", "L", "20"],
3192          ["20", "circle", "red", "S", "10"],
3193          ["21", "ellipse", "black", "L", "30"],
3194          ["22", "triangle", "red", "L", "30"],
3195          ["23", "circle", "green", "S", "20"],
3196          ["24", "square", "green", "L", "20"],
3197          ["25", "circle", "red", "S", "10"],
3198         ];
3199 
3200     string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
3201     string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
3202     writeUnittestTsvFile(fpath_data5x25, data5x25);
3203     writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1 .. $]);
3204 
3205     string[][] data5x25ExpectedDistinctK2P40 =
3206         [["ID", "Shape", "Color", "Size", "Weight"],
3207          ["03", "square", "black", "L", "20"],
3208          ["05", "ellipse", "red", "S", "20"],
3209          ["08", "square", "black", "S", "10"],
3210          ["10", "square", "green", "L", "20"],
3211          ["13", "ellipse", "red", "S", "20"],
3212          ["15", "ellipse", "red", "L", "30"],
3213          ["16", "square", "red", "S", "10"],
3214          ["18", "square", "red", "S", "20"],
3215          ["19", "square", "black", "L", "20"],
3216          ["21", "ellipse", "black", "L", "30"],
3217          ["24", "square", "green", "L", "20"],
3218         ];
3219 
3220     string[][] data5x25ExpectedDistinctK2K4P20 =
3221         [["ID", "Shape", "Color", "Size", "Weight"],
3222          ["03", "square", "black", "L", "20"],
3223          ["07", "triangle", "red", "L", "20"],
3224          ["08", "square", "black", "S", "10"],
3225          ["10", "square", "green", "L", "20"],
3226          ["11", "triangle", "red", "L", "20"],
3227          ["16", "square", "red", "S", "10"],
3228          ["18", "square", "red", "S", "20"],
3229          ["19", "square", "black", "L", "20"],
3230          ["22", "triangle", "red", "L", "30"],
3231          ["24", "square", "green", "L", "20"],
3232         ];
3233 
3234     string[][] data5x25ExpectedDistinctK2K3K4P20 =
3235         [["ID", "Shape", "Color", "Size", "Weight"],
3236          ["04", "circle", "green", "L", "30"],
3237          ["07", "triangle", "red", "L", "20"],
3238          ["09", "circle", "black", "S", "20"],
3239          ["11", "triangle", "red", "L", "20"],
3240          ["12", "circle", "green", "L", "30"],
3241          ["14", "circle", "green", "L", "30"],
3242          ["16", "square", "red", "S", "10"],
3243          ["18", "square", "red", "S", "20"],
3244          ["22", "triangle", "red", "L", "30"],
3245         ];
3246 
3247     /* Fields 2 and 4 from data5x25. Distinct sampling should select the same rows for equivalent keys. (See the sanity check after the array.) */
3248     string[][] data2x25 =
3249         [["Shape", "Size"],
3250          ["circle", "S"],
3251          ["circle", "L"],
3252          ["square", "L"],
3253          ["circle", "L"],
3254          ["ellipse", "S"],
3255          ["triangle", "S"],
3256          ["triangle", "L"],
3257          ["square", "S"],
3258          ["circle", "S"],
3259          ["square", "L"],
3260          ["triangle", "L"],
3261          ["circle", "L"],
3262          ["ellipse", "S"],
3263          ["circle", "L"],
3264          ["ellipse", "L"],
3265          ["square", "S"],
3266          ["circle", "L"],
3267          ["square", "S"],
3268          ["square", "L"],
3269          ["circle", "S"],
3270          ["ellipse", "L"],
3271          ["triangle", "L"],
3272          ["circle", "S"],
3273          ["square", "L"],
3274          ["circle", "S"],
3275         ];
3276 
3277     string fpath_data2x25 = buildPath(testDir, "data2x25.tsv");
3278     string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv");
3279     writeUnittestTsvFile(fpath_data2x25, data2x25);
3280     writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1 .. $]);
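    /* Sanity check added for illustration: data2x25 should be exactly fields 2 and 4
     * (Shape and Size) of data5x25, header included.
     */
    foreach (i, row; data2x25)
    {
        assert(row == [data5x25[i][1], data5x25[i][3]]);
    }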
3281 
3282     string[][] data2x25ExpectedDistinctK1K2P20 =
3283         [["Shape", "Size"],
3284          ["square", "L"],
3285          ["triangle", "L"],
3286          ["square", "S"],
3287          ["square", "L"],
3288          ["triangle", "L"],
3289          ["square", "S"],
3290          ["square", "S"],
3291          ["square", "L"],
3292          ["triangle", "L"],
3293          ["square", "L"],
3294         ];
3295 
3296     string[][] data1x25 =
3297         [["Shape-Size"],
3298          ["circle-S"],
3299          ["circle-L"],
3300          ["square-L"],
3301          ["circle-L"],
3302          ["ellipse-S"],
3303          ["triangle-S"],
3304          ["triangle-L"],
3305          ["square-S"],
3306          ["circle-S"],
3307          ["square-L"],
3308          ["triangle-L"],
3309          ["circle-L"],
3310          ["ellipse-S"],
3311          ["circle-L"],
3312          ["ellipse-L"],
3313          ["square-S"],
3314          ["circle-L"],
3315          ["square-S"],
3316          ["square-L"],
3317          ["circle-S"],
3318          ["ellipse-L"],
3319          ["triangle-L"],
3320          ["circle-S"],
3321          ["square-L"],
3322          ["circle-S"],
3323         ];
3324 
3325     string fpath_data1x25 = buildPath(testDir, "data1x25.tsv");
3326     string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv");
3327     writeUnittestTsvFile(fpath_data1x25, data1x25);
3328     writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1 .. $]);
3329 
3330     string[][] data1x25ExpectedDistinctK1P20 =
3331         [["Shape-Size"],
3332          ["triangle-L"],
3333          ["square-S"],
3334          ["triangle-L"],
3335          ["ellipse-L"],
3336          ["square-S"],
3337          ["square-S"],
3338          ["ellipse-L"],
3339          ["triangle-L"],
3340         ];
3341 
3342     string[][] data1x25ExpectedDistinctK1P20Probs =
3343         [["random_value", "Shape-Size"],
3344          ["0", "triangle-L"],
3345          ["0", "square-S"],
3346          ["0", "triangle-L"],
3347          ["0", "ellipse-L"],
3348          ["0", "square-S"],
3349          ["0", "square-S"],
3350          ["0", "ellipse-L"],
3351          ["0", "triangle-L"],
3352         ];
3353 
3354     string[][] data1x25ExpectedDistinctK1P20ProbsInorder =
3355         [["random_value", "Shape-Size"],
3356          ["1", "circle-S"],
3357          ["4", "circle-L"],
3358          ["2", "square-L"],
3359          ["4", "circle-L"],
3360          ["2", "ellipse-S"],
3361          ["1", "triangle-S"],
3362          ["0", "triangle-L"],
3363          ["0", "square-S"],
3364          ["1", "circle-S"],
3365          ["2", "square-L"],
3366          ["0", "triangle-L"],
3367          ["4", "circle-L"],
3368          ["2", "ellipse-S"],
3369          ["4", "circle-L"],
3370          ["0", "ellipse-L"],
3371          ["0", "square-S"],
3372          ["4", "circle-L"],
3373          ["0", "square-S"],
3374          ["2", "square-L"],
3375          ["1", "circle-S"],
3376          ["0", "ellipse-L"],
3377          ["0", "triangle-L"],
3378          ["1", "circle-S"],
3379          ["2", "square-L"],
3380          ["1", "circle-S"],
3381         ];
3382 
3383     /*
3384      * Enough setup! Actually run some tests!
3385      */
3386 
3387     /* Shuffling tests. Headers, static seed, compatibility mode. With weights and without. */
3388     testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty);
3389     testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0);
3390     testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1);
3391     testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat);
3392     testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat);
3393     testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat);
3394     testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
3395     testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
3396     testTsvSample(["test-a8b", "-H", "-s", "--weight-field", "field_c", fpath_data3x6], data3x6ExpectedPermuteWt3);
3397     testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3398     testTsvSample(["test-a9b", "-H", "-s", "--print-random", "-w", "field_c", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3399     testTsvSample(["test-a9c", "-H", "-s", "--print-random", "-w", "f*c", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3400     testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3401     testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3402     testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs);
3403     testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
3404     testTsvSample(["test-a13b", "-H", "-v", "41", "-w", "field_c", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs);
3405 
3406     /* Shuffling, without compatibility mode, or with both compatibility and printing. */
3407     testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
3408     testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0);
3409     testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1);
3410     testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle);
3411     testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap);
3412     testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap);
3413     testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3);
3414     testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3415     testTsvSample(["test-aa8b", "-H", "-s", "--print-random", "-w", "field_c", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs);
3416     testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs);
3417 
3418     /* Reservoir sampling using Algorithm R.
3419      * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.)
3420      */
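    /* Illustrative sketch only (not the code under test): Algorithm R keeps the first k lines,
     * then replaces a random reservoir slot with probability k/i for the i-th line. The tests
     * below exercise tsv-sample's --prefer-algorithm-r code path; the helper here is hypothetical.
     */
    {
        import std.algorithm : min;
        import std.random : Random, uniform;

        string[] reservoirSampleSketch(string[] lines, size_t k, ref Random rng)
        {
            auto reservoir = lines[0 .. min(k, lines.length)].dup;
            foreach (i; k .. lines.length)
            {
                immutable j = uniform(0, i + 1, rng);
                if (j < k) reservoir[j] = lines[i];
            }
            return reservoir;
        }

        auto sketchRNG = Random(12345);                 // Arbitrary seed for the sketch.
        auto sample = reservoirSampleSketch(["a", "b", "c", "d", "e"], 3, sketchRNG);
        assert(sample.length == 3);
    }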
3421     testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
3422     testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
3423     testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0);
3424     testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0);
3425     testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1);
3426     testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1);
3427     testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6);
3428     testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6);
3429     testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5);
3430     testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4);
3431     testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3);
3432     testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2);
3433     testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1);
3434 
3435     /* Inorder versions of Algorithm R tests. */
3436     testTsvSample(["test-ai10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
3437     testTsvSample(["test-ai11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
3438     testTsvSample(["test-ai12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
3439     testTsvSample(["test-ai13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0);
3440     testTsvSample(["test-ai14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1);
3441     testTsvSample(["test-ai15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1);
3442     testTsvSample(["test-ai16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder);
3443     testTsvSample(["test-ai17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder);
3444     testTsvSample(["test-ai18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5Inorder);
3445     testTsvSample(["test-ai19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4Inorder);
3446     testTsvSample(["test-ai20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3Inorder);
3447     testTsvSample(["test-ai21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2Inorder);
3448     testTsvSample(["test-ai22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1Inorder);
3449 
3450     /* Bernoulli sampling cases. */
3451     testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty);
3452     testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0);
3453     testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1);
3454     testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6);
3455     testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6);
3456     testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3457     testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60);
3458     testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60);
3459     testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs);
3460 
3461     /* Bernoulli sampling with probabilities in the skip-sampling range, or with --prefer-skip-sampling. */
3462     testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01);
3463     testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02);
3464     testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03);
3465     testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01);
3466     testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02);
3467     testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03);
3468     testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40);
3469 
3470     /* Distinct sampling cases. */
3471     testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
3472     testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
3473     testTsvSample(["test-a24b", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "field_a", fpath_data3x0], data3x0);
3474     testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1);
3475     testTsvSample(["test-a25b", "-H", "-s", "-p", "1.0", "-k", "field_b", fpath_data3x1], data3x1);
3476     testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6);
3477     testTsvSample(["test-a26b", "-H", "-s", "-p", "1.0", "-k", "field_b", fpath_data3x6], data3x6);
3478     testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
3479     testTsvSample(["test-a27b", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", fpath_data3x6], data3x6ExpectedDistinctK1K3P60);
3480 
3481     /* Generating random weights. Use the Bernoulli sampling test set at prob 100% for uniform sampling.
3482      * For weighted sampling, use the weighted cases, but with the expected results in the original
3483      * input order; the sanity check below confirms the inorder set is a reordering of the shuffled set. */
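    /* Sanity check added for illustration: the inorder weighted expectation contains the same
     * (random_value, line) rows as the shuffled weighted expectation, just reordered.
     */
    {
        import std.algorithm : isPermutation;
        assert(isPermutation(data3x6ExpectedWt3ProbsInorder[1 .. $],
                             data3x6ExpectedPermuteWt3Probs[1 .. $]));
    }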
3484     testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3485     testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100);
3486     testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
3487                   data3x6ExpectedWt3ProbsInorder);
3488     testTsvSample(["test-a30b", "-H", "-s", "--gen-random-inorder", "--weight-field", "field_c", fpath_data3x6],
3489                   data3x6ExpectedWt3ProbsInorder);
3490     testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
3491                   data3x6ExpectedWt3V41ProbsInorder);
3492     testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6],
3493                   data3x6ExpectedDistinctK1K3P60Probs);
3494     testTsvSample(["test-a32b", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", "--print-random", fpath_data3x6],
3495                   data3x6ExpectedDistinctK1K3P60Probs);
3496     testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header",
3497                    "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom);
3498     testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
3499                   data3x6ExpectedDistinctK2P2ProbsInorder);
3500 
3501     /* Simple random sampling with replacement. */
3502     testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty);
3503     testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty);
3504     testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0);
3505     testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0);
3506     testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3);
3507     testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10);
3508     testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77);
3509 
3510     /* Shuffling, compatibility mode, without headers. */
3511     testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1 .. $]);
3512     testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1 .. $]);
3513     testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1 .. $]);
3514     testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1 .. $]);
3515     testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1 .. $]);
3516     testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]);
3517     testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]);
3518     testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]);
3519     testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. $]);
3520 
3521     /* Shuffling, no headers, without compatibility mode, or with printing and compatibility mode. */
3522     testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1 .. $]);
3523     testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1 .. $]);
3524     testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1 .. $]);
3525     testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1 .. $]);
3526     testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]);
3527     testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]);
3528     testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]);
3529 
3530     /* Reservoir sampling using Algorithm R, no headers. */
3531     testTsvSample(["test-ac10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty);
3532     testTsvSample(["test-ac11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty);
3533     testTsvSample(["test-ac14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1 .. $]);
3534     testTsvSample(["test-ac15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1 .. $]);
3535     testTsvSample(["test-ac16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]);
3536     testTsvSample(["test-ac17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]);
3537     testTsvSample(["test-ac18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5[1 .. $]);
3538     testTsvSample(["test-ac19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4[1 .. $]);
3539     testTsvSample(["test-ac20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3[1 .. $]);
3540     testTsvSample(["test-ac21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2[1 .. $]);
3541     testTsvSample(["test-ac22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]);
3542 
3543     /* Reservoir sampling using Algorithm R, no headers, inorder output. */
3544     testTsvSample(["test-aj10", "--prefer-algorithm-r", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
3545     testTsvSample(["test-aj11", "--prefer-algorithm-r", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
3546     testTsvSample(["test-aj14", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3547     testTsvSample(["test-aj15", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3548     testTsvSample(["test-aj16", "--prefer-algorithm-r", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]);
3549     testTsvSample(["test-aj17", "--prefer-algorithm-r", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]);
3550     testTsvSample(["test-aj18", "--prefer-algorithm-r", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5Inorder[1 .. $]);
3551     testTsvSample(["test-aj19", "--prefer-algorithm-r", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4Inorder[1 .. $]);
3552     testTsvSample(["test-aj20", "--prefer-algorithm-r", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3Inorder[1 .. $]);
3553     testTsvSample(["test-aj21", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2Inorder[1 .. $]);
3554     testTsvSample(["test-aj22", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]);
3555 
3556     /* Bernoulli sampling cases. */
3557     testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1 .. $]);
3558     testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]);
3559     testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]);
3560     testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
3561     testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1 .. $]);
3562     testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. $]);
3563 
3564     /* Bernoulli sampling with probabilities in skip sampling range. */
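    /* These probabilities are small enough that the skip-sampling code path is used.
     * The --compatibility-mode runs cover the per-line random value path on the same
     * inputs, and test-bb7 uses --prefer-skip-sampling to select skip sampling at a
     * larger probability.
     */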
3565     testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1 .. $]);
3566     testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1 .. $]);
3567     testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1 .. $]);
3568     testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1 .. $]);
3569     testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1 .. $]);
3570     testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1 .. $]);
3571     testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. $]);
3572 
3573     /* Distinct sampling cases. */
3574     testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1 .. $]);
3575     testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3576     testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3577     testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
3578 
    /* Generating random values (--gen-random-inorder, --print-random), including for distinct sampling. */
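    /* At probability 1.0 Bernoulli sampling selects every line, so with the static
     * seed the prob 1.0 Bernoulli --print-random expectations are reused as the
     * --gen-random-inorder expectations (test-b20).
     */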
3580     testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
3581     testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1 .. $]);
3582     testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
3583                   data3x6ExpectedDistinctK1K3P60Probs[1 .. $]);
    testTsvSample(["test-b24b", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
3585                   data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]);
3586 
3587     /* Simple random sampling with replacement. */
3588     testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
3589     testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
3590     testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1 .. $]);
3591     testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1 .. $]);
3592     testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]);
3593 
3594     /* Multi-file tests. */
3595     testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode",
3596                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3597                   combo1ExpectedPermuteCompat);
3598     testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
3599                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3600                   combo1ExpectedPermuteCompatProbs);
3601     testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
3602                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3603                   combo1ExpectedPermuteWt3Probs);
3604     testTsvSample(["test-c3b", "--header", "--static-seed", "--print-random", "--weight-field", "field_c",
3605                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3606                   combo1ExpectedPermuteWt3Probs);
3607     testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode",
3608                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3609                   combo1ExpectedPermuteWt3);
3610     testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4",
3611                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3612                   combo1ExpectedSampleAlgoRNum4);
3613     testTsvSample(["test-c5b", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder",
3614                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3615                   combo1ExpectedSampleAlgoRNum4Inorder);
3616 
3617     /* Multi-file, no headers. */
3618     testTsvSample(["test-c6", "--static-seed", "--compatibility-mode",
3619                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3620                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3621                   combo1ExpectedPermuteCompat[1 .. $]);
3622     testTsvSample(["test-c7", "--static-seed", "--print-random",
3623                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3624                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3625                   combo1ExpectedPermuteCompatProbs[1 .. $]);
3626     testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3",
3627                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3628                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3629                   combo1ExpectedPermuteWt3Probs[1 .. $]);
3630     testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode",
3631                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3632                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3633                   combo1ExpectedPermuteWt3[1 .. $]);
3634     testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4",
3635                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3636                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3637                   combo1ExpectedSampleAlgoRNum4[1 .. $]);
3638     testTsvSample(["test-c10b", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder",
3639                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3640                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3641                   combo1ExpectedSampleAlgoRNum4Inorder[1 .. $]);
3642 
3643     /* Bernoulli sampling cases. */
3644     testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5",
3645                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3646                   combo1ExpectedBernoulliCompatP50Probs);
3647     testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4",
3648                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3649                   combo1ExpectedBernoulliCompatP40);
3650     testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5",
3651                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3652                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3653                   combo1ExpectedBernoulliCompatP50Probs[1 .. $]);
3654     testTsvSample(["test-c14", "--static-seed", "--prob", ".4",
3655                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3656                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3657                   combo1ExpectedBernoulliCompatP40[1 .. $]);
3658 
3659     /* Bernoulli sampling with probabilities in skip sampling range. */
3660     testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03",
3661                    fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10],
3662                   combo2ExpectedBernoulliSkipV333P03);
    testTsvSample(["test-cc2", "-v", "333", "-p", "0.03",
3664                    fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader],
3665                   combo2ExpectedBernoulliSkipV333P03[1 .. $]);
3666 
3667     /* Distinct sampling cases. */
3668     testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
3669                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3670                   combo1ExpectedDistinctK1P40);
3671     testTsvSample(["test-c13b", "--header", "--static-seed", "--key-fields", "field_a", "--prob", ".4",
3672                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3673                   combo1ExpectedDistinctK1P40);
3674     testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4",
3675                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3676                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3677                   combo1ExpectedDistinctK1P40[1 .. $]);
3678 
3679     /* Generating random weights. */
3680     testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder",
3681                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3682                   combo1ExpectedProbsInorder);
3683     testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder",
3684                    fpath_data3x3_noheader, fpath_data3x1_noheader,
3685                    fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
3686                   combo1ExpectedProbsInorder[1 .. $]);
3687 
3688     /* Simple random sampling with replacement. */
3689     testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10",
3690                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
3691                   combo1ExpectedReplaceNum10);
3692 
3693     testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10",
3694                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
3695                    fpath_data3x6_noheader, fpath_data3x2_noheader],
3696                   combo1ExpectedReplaceNum10[1 .. $]);
3697 
3698     /* Single column file. */
3699     testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
3700     testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
3701 
3702     /* Distributions. */
3703     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
3704     testTsvSample(["test-e1b", "-H", "-s", "-w", "weight", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
3705     testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs);
3706     testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs);
3707     testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs);
3708     testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs);
3709 
    /* Tests of the sample size option (--n|num) with random sampling, Bernoulli
     * sampling, and distinct sampling.
     *
     * Note: These tests are constructed so that the sample size does not affect
     * output order.
     */
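    /* Each expected result below is a leading slice of the corresponding full-data
     * expectation, so any change in output order caused by the sample size would show
     * up as a mismatch. The loop runs n from above the data size down to 1, covering
     * sample sizes larger than, equal to, and smaller than the input.
     */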
3715     import std.algorithm : min;
3716     for (size_t n = data3x6.length + 2; n >= 1; n--)
3717     {
        /* reservoirSamplingViaHeap. */
3720         size_t expectedLength = min(data3x6.length, n + 1);
3721         testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
3722                        "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
3723 
3724         testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
3725                        "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);
3726 
3727         testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
3728                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]);
3729 
3730         testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
3731                        "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]);
3732 
3733         testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
3734                        "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]);
3735 
3736         testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
3737                        fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]);
3738 
3739         testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
3740                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]);
3741 
3742         testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
3743                        "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]);
3744 
3745         testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string,
3746                        "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]);
3747 
        /* Bernoulli sampling. */
3751         size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length);
3752 
3753         testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3754                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]);
3755 
3756         testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3757                        "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]);
3758 
3759         testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3760                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]);
3761 
3762         testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
3763                        fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]);
3764 
        /* Distinct sampling. */
3767         size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length);
3768 
3769         testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
3770                        "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]);
3771 
3772         testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
3773                        fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]);
3774 
3775         testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
3776                        "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]);
3777 
3778         testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
3779                        fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
3780     }
3781 
3782     /* Similar tests with the 1x10 data set. */
3783     for (size_t n = data1x10.length + 2; n >= 1; n--)
3784     {
3785         size_t expectedLength = min(data1x10.length, n + 1);
3786         testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
3787                        "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);
3788 
3789         testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
3790                        "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);
3791 
3792         testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
3793                        fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);
3794 
3795         testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
3796                        "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
3797     }
3798 
3799     /* Simple random sampling with replacement: ensure sample size doesn't change order. */
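    /* The expected output for each n is the first n sampled lines of the --num 10
     * expectation, plus the header line in the -H case.
     */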
3800     for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
3801     {
3802         testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
3803                       data3x6ExpectedReplaceNum10[0 .. n + 1]);
3804 
3805         testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
3806                       data3x6ExpectedReplaceNum10[1 .. n + 1]);
3807     }
3808 
3809     /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */
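    /* Expected output is a leading slice of the full skip-sampling expectation: the
     * header line plus up to n selected lines (no header in the second case).
     */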
3810     for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
3811     {
3812         size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);
3813 
3814         testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
3815                        "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);
3816 
3817         testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
3818                        fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
3819     }
3820 
3821     /* Inorder sampling tests using reservoir sampling via heap (compatibility mode). */
3822     testTsvSample(["test-ar10", "--compatibility-mode", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
3823     testTsvSample(["test-ar11", "--compatibility-mode", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
3824     testTsvSample(["test-ar12", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
3825     testTsvSample(["test-ar13", "--compatibility-mode", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0);
3826     testTsvSample(["test-ar14", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1);
3827     testTsvSample(["test-ar15", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1);
3828     testTsvSample(["test-ar16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
3829     testTsvSample(["test-ar17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
3830     testTsvSample(["test-ar18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum5Inorder);
    testTsvSample(["test-ar19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum4Inorder);
3832     testTsvSample(["test-ar20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum3Inorder);
3833     testTsvSample(["test-ar21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum2Inorder);
3834     testTsvSample(["test-ar22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum1Inorder);
3835 
3836     testTsvSample(["test-as10", "--compatibility-mode", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
3837     testTsvSample(["test-as11", "--compatibility-mode", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
3838     testTsvSample(["test-as14", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3839     testTsvSample(["test-as15", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
3840     testTsvSample(["test-as16", "--compatibility-mode", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
3841     testTsvSample(["test-as17", "--compatibility-mode", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
3842     testTsvSample(["test-as18", "--compatibility-mode", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5Inorder[1 .. $]);
3843     testTsvSample(["test-as19", "--compatibility-mode", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4Inorder[1 .. $]);
3844     testTsvSample(["test-as20", "--compatibility-mode", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3Inorder[1 .. $]);
3845     testTsvSample(["test-as21", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2Inorder[1 .. $]);
3846     testTsvSample(["test-as22", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1Inorder[1 .. $]);
3847 
    /* Inorder sampling tests with random value printing. --compatibility-mode is not
     * needed, as --print-random selects the same behavior; the test-at19/test-at20 and
     * test-au19 pairs run with and without the flag to confirm identical results.
     */
3849     testTsvSample(["test-at16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
3850     testTsvSample(["test-at17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
3851     testTsvSample(["test-at18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum5ProbsInorder);
3852     testTsvSample(["test-at19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
3853     testTsvSample(["test-at19",                         "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
3854     testTsvSample(["test-at20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
3855     testTsvSample(["test-at20",                         "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
3856     testTsvSample(["test-at21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum2ProbsInorder);
3857     testTsvSample(["test-at22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum1ProbsInorder);
3858 
3859     testTsvSample(["test-au16", "--compatibility-mode", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
3860     testTsvSample(["test-au17", "--compatibility-mode", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
3861     testTsvSample(["test-au18", "--compatibility-mode", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5ProbsInorder[1 .. $]);
3862     testTsvSample(["test-au19", "--compatibility-mode", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
3863     testTsvSample(["test-au19",                         "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
3864     testTsvSample(["test-au20", "--compatibility-mode", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3ProbsInorder[1 .. $]);
3865     testTsvSample(["test-au21", "--compatibility-mode", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2ProbsInorder[1 .. $]);
3866     testTsvSample(["test-au22", "--compatibility-mode", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1ProbsInorder[1 .. $]);
3867 
    /* Inorder weighted sampling tests, using weight field 3. */
    testTsvSample(["test-ax16", "-H", "-s", "-n", "7", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
    testTsvSample(["test-ax17", "-H", "-s", "-n", "6", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
    testTsvSample(["test-ax18", "-H", "-s", "-n", "5", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num5Inorder);
    testTsvSample(["test-ax19", "-H", "-s", "-n", "4", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num4Inorder);
    testTsvSample(["test-ax20", "-H", "-s", "-n", "3", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num3Inorder);
    testTsvSample(["test-ax21", "-H", "-s", "-n", "2", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num2Inorder);
    testTsvSample(["test-ax22", "-H", "-s", "-n", "1", "-i", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Num1Inorder);
3876 
    testTsvSample(["test-ay16", "-s", "-n", "7", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
    testTsvSample(["test-ay17", "-s", "-n", "6", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
    testTsvSample(["test-ay18", "-s", "-n", "5", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num5Inorder[1 .. $]);
    testTsvSample(["test-ay19", "-s", "-n", "4", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num4Inorder[1 .. $]);
    testTsvSample(["test-ay20", "-s", "-n", "3", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num3Inorder[1 .. $]);
    testTsvSample(["test-ay21", "-s", "-n", "2", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num2Inorder[1 .. $]);
    testTsvSample(["test-ay22", "-s", "-n", "1", "-i", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Num1Inorder[1 .. $]);
3884 
3885     /*
3886      * Distinct sampling tests.
3887      */
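    /* Keys are specified by field number ('-k 2'), by name ('-k Shape'), as lists
     * ('-k 2,4', '-k Shape,Size'), and as ranges ('-k 2-4', '-k Shape-Size'). The
     * numeric forms are repeated without headers (tests j4-j6).
     */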
3888     testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
3889                   data5x25ExpectedDistinctK2P40);
3890 
3891     testTsvSample(["test-j1b", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "Shape", fpath_data5x25],
3892                   data5x25ExpectedDistinctK2P40);
3893 
3894     testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
3895                   data5x25ExpectedDistinctK2K4P20);
3896 
3897     testTsvSample(["test-j2b", "-H", "-s", "-p", "0.20", "-k", "Shape,Size", fpath_data5x25],
3898                   data5x25ExpectedDistinctK2K4P20);
3899 
3900     testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
3901                   data5x25ExpectedDistinctK2K3K4P20);
3902 
3903     testTsvSample(["test-j3b", "-H", "-s", "-p", "0.20", "-k", "Shape-Size", fpath_data5x25],
3904                   data5x25ExpectedDistinctK2K3K4P20);
3905 
3906     testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
3907                   data5x25ExpectedDistinctK2P40[1 .. $]);
3908 
3909     testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
3910                   data5x25ExpectedDistinctK2K4P20[1 .. $]);
3911 
3912     testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
3913                   data5x25ExpectedDistinctK2K3K4P20[1 .. $]);
3914 
3915 
    /* These distinct sampling tests check that using the whole line as the key
     * ('-k 0') gives the same result as specifying all fields in order. They also
     * check that the field numbers themselves don't matter: '-k 1,2' on data2x25
     * selects the same keys as '-k 2,4' on data5x25.
     */
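    /* The '-k *' form selects all fields by name, so on these files it is expected to
     * match the whole-line key '-k 0' (tests j8b, j12b, j16b, j20b).
     */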
3920     testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25],
3921                   data2x25ExpectedDistinctK1K2P20);
3922 
3923     testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25],
3924                   data2x25ExpectedDistinctK1K2P20);
3925 
3926     testTsvSample(["test-j8b", "-H", "-s", "-p", "0.20", "-k", "*", fpath_data2x25],
3927                   data2x25ExpectedDistinctK1K2P20);
3928 
3929     testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader],
3930                   data2x25ExpectedDistinctK1K2P20[1 .. $]);
3931 
3932     testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader],
3933                   data2x25ExpectedDistinctK1K2P20[1 .. $]);
3934 
3935     /* Similar to the last set, but for a 1-column file. Also with random value printing. */
3936     testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25],
3937                   data1x25ExpectedDistinctK1P20);
3938 
3939     testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25],
3940                   data1x25ExpectedDistinctK1P20);
3941 
3942     testTsvSample(["test-j12b", "-H", "-s", "-p", "0.20", "-k", "*", fpath_data1x25],
3943                   data1x25ExpectedDistinctK1P20);
3944 
3945     testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader],
3946                   data1x25ExpectedDistinctK1P20[1 .. $]);
3947 
3948     testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader],
3949                   data1x25ExpectedDistinctK1P20[1 .. $]);
3950 
3951     testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25],
3952                   data1x25ExpectedDistinctK1P20Probs);
3953 
3954     testTsvSample(["test-j15b", "-H", "-s", "-p", "0.20", "-k", `Shape\-Size`, "--print-random", fpath_data1x25],
3955                   data1x25ExpectedDistinctK1P20Probs);
3956 
3957     testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25],
3958                   data1x25ExpectedDistinctK1P20Probs);
3959 
3960     testTsvSample(["test-j16b", "-H", "-s", "-p", "0.20", "-k", "*", "--print-random", fpath_data1x25],
3961                   data1x25ExpectedDistinctK1P20Probs);
3962 
3963     testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader],
3964                   data1x25ExpectedDistinctK1P20Probs[1 .. $]);
3965 
3966     testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader],
3967                   data1x25ExpectedDistinctK1P20Probs[1 .. $]);
3968 
3969     testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25],
3970                   data1x25ExpectedDistinctK1P20ProbsInorder);
3971 
3972     testTsvSample(["test-j19b", "-H", "-s", "-p", "0.20", "-k", `Shape\-Size`, "--gen-random-inorder", fpath_data1x25],
3973                   data1x25ExpectedDistinctK1P20ProbsInorder);
3974 
3975     testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25],
3976                   data1x25ExpectedDistinctK1P20ProbsInorder);
3977 
3978     testTsvSample(["test-j20b", "-H", "-s", "-p", "0.20", "-k", "*", "--gen-random-inorder", fpath_data1x25],
3979                   data1x25ExpectedDistinctK1P20ProbsInorder);
3980 
3981     testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader],
3982                   data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);
3983 
3984     testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader],
3985                   data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);
3986 
3987 }