1 /**
2 Command line tool for randomizing or sampling lines from input streams. Several
3 sampling methods are available, including simple random sampling, weighted random
4 sampling, Bernoulli sampling, and distinct sampling.
5 
6 Copyright (c) 2017-2018, eBay Software Foundation
7 Initially written by Jon Degenhardt
8 
9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
10 */
11 module tsv_sample;
12 
13 import std.range;
14 import std.stdio;
15 import std.typecons : tuple, Flag;
16 
version(unittest)
{
    // Unit test builds obtain their entry point from the compiler's -main switch.
}
else
{
    /** Program entry point.
     *
     * Parses the command line into a TsvSampleOptions, then runs tsvSample with
     * standard output wrapped in a BufferedOutputRange.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first.
     *
     * Returns: The exit code supplied by argument processing when it indicates the
     * program should not continue; 1 if sampling throws an Exception; 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        version(D_Coverage) version(DigitalMars)
        {
            /* DMD code coverage builds: merge coverage reports rather than overwrite. */
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        auto argsOutcome = cmdopt.processArgs(cmdArgs);

        /* First element: continue running? Second element: exit code to use if not. */
        immutable bool shouldContinue = argsOutcome[0];
        immutable int argsExitCode = argsOutcome[1];
        if (!shouldContinue) return argsExitCode;

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        int exitCode = 0;
        try
        {
            import tsvutil : BufferedOutputRange;

            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            exitCode = 1;
        }
        return exitCode;
    }
}
55 
/** Short help text, printed ahead of the option list when help is requested.
 */
auto helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

The '--n|num' option limits the sample size produced. It speeds up line
order randomization and weighted sampling significantly. It is also used
to terminate sampling with replacement.

Use '--help-verbose' for detailed information.

Options:
EOS";
86 
/** Detailed help text, printed ahead of the option list for '--help-verbose'.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up line order randomization and weighted sampling significantly
(details below). It is also used to terminate sampling with replacement.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, so there is no memory accumulation. These
algorithms support arbitrary size inputs. Sampling with replacement reads
all lines into memory and is limited by available memory. The line order
randomization algorithms hold the full output set in memory prior to
generating results. This ultimately limits the size of the output set.
These memory needs can be reduced by using a sample size (--n|num). This
engages reservoir sampling. Output order is not affected. Both
'tsv-sample -n 1000' and 'tsv-sample | head -n 1000' produce the same
results, but the former is quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and are given the value zero. Input order is not retained,
instead lines are output ordered by the randomized weight that was
assigned. This means that a smaller valid sample can be produced by taking
the first N lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the algorithms work by generating a random
value for each line. The nature of these values depends on the sampling
algorithm. They are used for both line selection and output ordering. The
'--print-random' option can be used to print these values. The random
value is prepended to the line separated by the --d|delimiter char (TAB by
default). The '--gen-random-inorder' option takes this one step further,
generating random values for all input lines without changing the input
order. The types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Options:
EOS";
165 
/** Container for command line options.
 *
 * Fields marked with an option name are filled in directly by getopt. Fields marked
 * "Derived" are computed by processArgs from the option values.
 */
struct TsvSampleOptions
{
    string programName;
    string[] files;
    bool helpVerbose = false;                  // --help-verbose
    bool hasHeader = false;                    // --H|header
    size_t sampleSize = 0;                     // --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  // --p|prob - Inclusion probability
    size_t[] keyFields;                        // --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    // --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           // --r|replace
    bool staticSeed = false;                   // --s|static-seed
    uint seedValueOptionArg = 0;               // --v|seed-value
    bool printRandom = false;                  // --print-random
    bool genRandomInorder = false;             // --gen-random-inorder
    string randomValueHeader = "random_value"; // --random-value-header
    char delim = '\t';                         // --d|delimiter
    bool versionWanted = false;                // --V|version
    bool hasWeightField = false;               // Derived.
    bool useBernoulliSampling = false;         // Derived.
    bool useDistinctSampling = false;          // Derived.
    bool usingUnpredictableSeed = true;        // Derived from --static-seed, --seed-value
    uint seed = 0;                             // Derived from --static-seed, --seed-value

    /** Parses the command line, validates option combinations, and fills in the
     * derived fields.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first. On success the array is
     *             truncated to the program name; remaining arguments are moved to
     *             `files` ("-" is used when no files were given).
     *
     * Returns: A tuple `(bool, int)`. The first element is true if the program should
     * continue running. When false, the second element is the exit code: 0 after
     * printing help or version information, 1 after an argument processing error
     * (the error is written to stderr).
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsvutil : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                "k|key-fields",    "<field-list>  Fields to use as key for distinct sampling. Use with --p|prob.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "w|weight-field",  "NUM  Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
                "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header",  "     Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,

                "d|delimiter",     "CHR  Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            /* Informational requests: print and signal a clean exit. */
            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            if (srsWithReplacement)
            {
                if (hasWeightField)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");
                }
                else if (!inclusionProbability.isNaN)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");
                }
                else if (keyFields.length > 0)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");
                }
                else if (printRandom || genRandomInorder)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");
                }
            }

            if (keyFields.length > 0)
            {
                if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields.");
            }

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */
            if (!inclusionProbability.isNaN)
            {
                if (inclusionProbability <= 0.0 || inclusionProbability > 1.0)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));
                }

                /* Key fields select distinct sampling; a bare probability selects Bernoulli sampling. */
                if (keyFields.length > 0) useDistinctSampling = true;
                else useBernoulliSampling = true;

                if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together.");
                if (genRandomInorder && !useDistinctSampling) throw new Exception("--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used.");
            }
            else if (genRandomInorder && !hasWeightField)
            {
                /* Unweighted in-order random value generation is handled by the Bernoulli routine. */
                useBernoulliSampling = true;
            }

            if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') ||
                randomValueHeader.canFind(delim))
            {
                throw new Exception("--random-value-header must be at least one character and not contain field delimiters or newlines.");
            }

            /* Seed. */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            /* An explicit non-zero --v|seed-value takes precedence over --s|static-seed. */
            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            /* Assume remaining args are files. Use standard input if files were not provided. */
            files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"];
            cmdArgs.length = 1;
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
/** Dispatches to the sampling routine selected by the command line options.
 *
 * The options are checked in a fixed priority order: sampling with replacement,
 * Bernoulli sampling, distinct sampling, weighted in-order random value generation,
 * reservoir sampling (when a sample size is given), and finally full line order
 * randomization.
 *
 * Params:
 *   cmdopt       = Processed command line options.
 *   outputStream = Output range of char that receives the results.
 */
void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    if (cmdopt.srsWithReplacement)
    {
        simpleRandomSamplingWithReplacement(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useBernoulliSampling)
    {
        if (cmdopt.genRandomInorder)
        {
            bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
        return;
    }

    if (cmdopt.useDistinctSampling)
    {
        if (cmdopt.genRandomInorder)
        {
            distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
        return;
    }

    if (cmdopt.genRandomInorder)
    {
        /* Only the weighted case reaches this point. */
        assert(cmdopt.hasWeightField);
        generateWeightedRandomValuesInorder(cmdopt, outputStream);
        return;
    }

    if (cmdopt.sampleSize != 0)
    {
        if (cmdopt.hasWeightField)
        {
            reservoirSampling!(Yes.isWeighted)(cmdopt, outputStream);
        }
        else
        {
            reservoirSampling!(No.isWeighted)(cmdopt, outputStream);
        }
        return;
    }

    /* No sample size: randomize the order of all lines. */
    if (cmdopt.hasWeightField)
    {
        randomizeLines!(Yes.isWeighted)(cmdopt, outputStream);
    }
    else
    {
        randomizeLines!(No.isWeighted)(cmdopt, outputStream);
    }
}
372 
/** Bernoulli sampling on the input stream.
 *
 * Every data line is assigned a uniform random value. With No.generateRandomAll a
 * line is written only if its value is less than the inclusion probability. With
 * Yes.generateRandomAll every line is written, prefixed by its random value. Line
 * order is not changed. If a sample size is set, processing stops once that many
 * lines have been written.
 *
 * Note: Performance tests show that skip sampling is faster when the inclusion
 * probability is approximately 4-5% or less. A performance optimization would be to
 * create a separate function for cases when the probability is small and the random
 * weights are not being output with each line. A disadvantage would be that the
 * random weights assigned to each element would change based on the sampling. Printed
 * weights would no longer be consistent run-to-run.
 *
 * Params:
 *   generateRandomAll = Yes to output all lines with their random values.
 *   cmdopt            = Processed command line options.
 *   outputStream      = Output range of char that receives the results.
 */
void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;
    import std.random : Random, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    immutable scoreFormatSpec = singleSpec("%.17g");
    auto rng = Random(cmdopt.seed);

    /* The header gets a random value column always in generate-all mode, else on request. */
    static if (generateRandomAll) immutable bool headerHasRandomColumn = true;
    else immutable bool headerHasRandomColumn = cmdopt.printRandom;

    immutable bool limitOutput = (cmdopt.sampleSize != 0);
    size_t linesEmitted = 0;
    bool headerDone = false;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Only the first header seen is written; later ones are dropped. */
                if (headerDone) continue;

                if (headerHasRandomColumn)
                {
                    outputStream.put(cmdopt.randomValueHeader);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");
                headerDone = true;
                continue;
            }

            /* One draw per data line, whether or not the line ends up selected. */
            double lineScore = uniform01(rng);

            static if (generateRandomAll)
            {
                outputStream.formatValue(lineScore, scoreFormatSpec);
                outputStream.put(cmdopt.delim);
            }
            else
            {
                if (!(lineScore < cmdopt.inclusionProbability)) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.formatValue(lineScore, scoreFormatSpec);
                    outputStream.put(cmdopt.delim);
                }
            }

            outputStream.put(line);
            outputStream.put("\n");

            if (limitOutput)
            {
                ++linesEmitted;
                if (linesEmitted == cmdopt.sampleSize) return;
            }
        }
    }
}
464 
/** Distinct sampling: selects a subset of the unique key field values.
 *
 * The key fields of each line are hashed (MurmurHash3, seeded with the random seed)
 * and the hash is mapped onto a number of buckets derived from the inclusion
 * probability. With No.generateRandomAll, lines whose key maps to bucket zero are
 * written. With Yes.generateRandomAll, every line is written prefixed by its bucket
 * number. If a sample size is set, processing stops once that many lines have been
 * written.
 *
 * Params:
 *   generateRandomAll = Yes to output all lines with their bucket numbers.
 *   cmdopt            = Processed command line options.
 *   outputStream      = Output range of char that receives the results.
 *
 * Throws: Exception if a line has too few fields to fill all the key fields.
 */
void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsvutil : InputFieldReordering, throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);

    static if (generateRandomAll)
    {
        import std.format : formatValue, singleSpec;
        immutable bucketFormatSpec = singleSpec("%d");
    }

    /* Separator inserted between the fields of a multi-field hash key. */
    immutable ubyte[1] keySeparator = [cmdopt.delim];

    uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint;

    /* Mapping used to collect the key fields from each line. */
    auto keyCollector = new InputFieldReordering!char(cmdopt.keyFields);

    /* The header gets a random value column always in generate-all mode, else on request. */
    static if (generateRandomAll) immutable bool headerHasRandomColumn = true;
    else immutable bool headerHasRandomColumn = cmdopt.printRandom;

    immutable bool limitOutput = (cmdopt.sampleSize != 0);
    size_t linesEmitted = 0;
    bool headerDone = false;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Only the first header seen is written; later ones are dropped. */
                if (headerDone) continue;

                if (headerHasRandomColumn)
                {
                    outputStream.put(cmdopt.randomValueHeader);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");
                headerDone = true;
                continue;
            }

            /* Collect the key field values, stopping once all have been seen. */
            keyCollector.initNewLine;
            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                keyCollector.processNextField(fieldIndex, fieldValue);
                if (keyCollector.allFieldsFilled) break;
            }

            if (!keyCollector.allFieldsFilled)
            {
                import std.format : format;
                throw new Exception(
                    format("Not enough fields in line. File: %s, Line: %s",
                           (filename == "-") ? "Standard Input" : filename, fileLineNum));
            }

            /* Hash the key fields, delimiter-separated, into a bucket number. */
            auto hasher = MurmurHash3!32(cmdopt.seed);
            foreach (count, key; keyCollector.outputFields.enumerate)
            {
                if (count > 0) hasher.put(keySeparator);
                hasher.put(cast(ubyte[]) key);
            }
            hasher.finish;
            auto bucket = hasher.get % numBuckets;

            static if (generateRandomAll)
            {
                outputStream.formatValue(bucket, bucketFormatSpec);
                outputStream.put(cmdopt.delim);
            }
            else
            {
                /* Only keys landing in bucket zero are part of the sample. */
                if (bucket != 0) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put('0');
                    outputStream.put(cmdopt.delim);
                }
            }

            outputStream.put(line);
            outputStream.put("\n");

            if (limitOutput)
            {
                ++linesEmitted;
                if (linesEmitted == cmdopt.sampleSize) return;
            }
        }
    }
}
589 
/** An implementation of reservoir sampling. Both weighted and uniform random sampling
591  * are supported.
592  *
593  * Both weighted and uniform random sampling are implemented using the one-pass algorithm
594  * described by Pavlos Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data
595  * Streams", Pavlos S. Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted
596  * case weights are simply set to one.
597  *
598  * The implementation uses a heap (priority queue) large enough to hold the desired
599  * number of lines. Input is read line-by-line, assigned a random value, and added to the
 * heap. The role of the heap is to identify the lines with the highest assigned random
 * values. Once the heap is full, adding a new line means dropping the line with the
 * lowest score. A "min" heap is used for this reason.
603  *
604  * When done reading all lines, the "min" heap is in the opposite order needed for output.
 * The desired order is obtained by removing each element one at a time from the heap.
606  * The underlying data store will have the elements in correct order.
607  *
608  * Generating output in weighted order matters for several reasons:
609  *  - For weighted sampling, it preserves the property that smaller valid subsets can be
610  *    created by taking the first N lines.
611  *  - For unweighted sampling, it ensures that all output permutations are possible, and
 *    are not influenced by input order or the heap data structure used.
613  *  - Order consistency when making repeated use of the same random seeds, but with
614  *    different sample sizes.
615  *
616  * There are use cases where only the selection set matters, for these some performance
617  * could be gained by skipping the reordering and simply printing the backing store
618  * array in-order, but making this distinction seems an unnecessary complication.
619  *
620  * Notes:
621  *  - In tsv-sample versions 1.2.1 and earlier this routine also supported randomization
622  *    of all input lines. This was dropped in version 1.2.2 in favor of the approach
 *    used in randomizeLines. The latter has significant advantages given that all data
 *    must be read into memory.
625  *  - The unweighted case could be sped up by adopting what is commonly known as
626  *    "Algorithm R" followed by a random walk on the resulting reservoir (e.g.
627  *    std.random.randomCover in the D standard library). This is faster than reversing
628  *    the heap prior to output. The downsides are that the result order would not be
629  *    consistent with the other routines and that random number printing does not make
630  *    sense. Order consistency matters only in the rare case when multiple randomizations
631  *    are being done with the same static seed. For a description of Algorithm R see:
632  *    https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R.
633  */
634 void reservoirSampling(Flag!"isWeighted" isWeighted, OutputRange)
635     (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
636 if (isOutputRange!(OutputRange, char))
637 {
638     import std.container.array;
639     import std.container.binaryheap;
640     import std.format : formatValue, singleSpec;
641     import std.random : Random, uniform01;
642     import tsvutil : throwIfWindowsNewlineOnUnix;
643 
644     static if (isWeighted) assert(cmdopt.hasWeightField);
645     else assert(!cmdopt.hasWeightField);
646 
647     assert(cmdopt.sampleSize > 0);
648 
649     auto randomGenerator = Random(cmdopt.seed);
650 
651     struct Entry
652     {
653         double score;
654         char[] line;
655     }
656 
657     /* Create the heap and backing data store.
658      *
659      * Note: An std.container.array is used as the backing store to avoid some issues in
660      * the standard library (Phobos) binaryheap implementation. Specifically, when an
     * std.container.array is used as backing store, the heap can be efficiently reversed by
662      * removing the heap elements. This leaves the backing store in the reversed order.
663      * However, the current binaryheap implementation does not support this for all
664      * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
665      */
666 
667     Array!Entry dataStore;
668     dataStore.reserve(cmdopt.sampleSize);
669     auto reservoir = dataStore.heapify!("a.score > b.score")(0);  // Min binaryheap
670 
671     /* Process each line. */
672     bool headerWritten = false;
673     foreach (filename; cmdopt.files)
674     {
675         auto inputStream = (filename == "-") ? stdin : filename.File();
676         foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
677         {
678             if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
679             if (fileLineNum == 1 && cmdopt.hasHeader)
680             {
681                 if (!headerWritten)
682                 {
683                     if (cmdopt.printRandom)
684                     {
685                         outputStream.put(cmdopt.randomValueHeader);
686                         outputStream.put(cmdopt.delim);
687                     }
688                     outputStream.put(line);
689                     outputStream.put("\n");
690                     headerWritten = true;
691                 }
692             }
693             else
694             {
695                 static if (!isWeighted)
696                 {
697                     double lineScore = uniform01(randomGenerator);
698                 }
699                 else
700                 {
701                     double lineWeight =
702                         getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);
703                     double lineScore =
704                         (lineWeight > 0.0)
705                         ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
706                         : 0.0;
707                 }
708 
709                 if (reservoir.length < cmdopt.sampleSize)
710                 {
711                     reservoir.insert(Entry(lineScore, line.dup));
712                 }
713                 else if (reservoir.front.score < lineScore)
714                 {
715                     reservoir.replaceFront(Entry(lineScore, line.dup));
716                 }
717             }
718         }
719     }
720 
721     /* All entries are in the reservoir. Time to print. The heap is in reverse order
722      * of assigned weights. Reversing order is done by removing all elements from the
723      * heap, this leaves the backing store in the correct order for output.
724      *
725      * The asserts here avoid issues with the current binaryheap implementation. They
726      * detect use of backing stores having a length not synchronized to the reservoir.
727      */
728     size_t numLines = reservoir.length;
729     assert(numLines == dataStore.length);
730 
731     while (!reservoir.empty) reservoir.removeFront;
732     assert(numLines == dataStore.length);
733 
734     immutable randomValueFormatSpec = singleSpec("%.17g");
735 
736     foreach (entry; dataStore)
737     {
738         if (cmdopt.printRandom)
739         {
740             outputStream.formatValue(entry.score, randomValueFormatSpec);
741             outputStream.put(cmdopt.delim);
742         }
743         outputStream.put(entry.line);
744         outputStream.put("\n");
745     }
746  }
747 
/** Writes every input line, in input order, prefixed by its weighted random value.
 *
 * This is the companion to weighted reservoir sampling. No reservoir is kept: each
 * line is scored as it is read and written immediately. The score formula is the
 * one reservoirSampling uses for weighted sampling.
 *
 * If cmdopt.sampleSize is non-zero, output stops once that many data lines have been
 * written. A header line, when present, is written once and is not counted.
 */
void generateWeightedRandomValuesInorder(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;
    import std.random : Random, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.hasWeightField);

    immutable scoreFormatSpec = singleSpec("%.17g");
    immutable bool hasLimit = (cmdopt.sampleSize != 0);
    auto rng = Random(cmdopt.seed);

    bool headerDone = false;
    size_t numLinesWritten = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            immutable bool onFirstLine = (fileLineNum == 1);
            if (onFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            if (onFirstLine && cmdopt.hasHeader)
            {
                /* Only the header of the first file is written. */
                if (headerDone) continue;

                outputStream.put(cmdopt.randomValueHeader);
                outputStream.put(cmdopt.delim);
                outputStream.put(line);
                outputStream.put("\n");
                headerDone = true;
                continue;
            }

            double weight = getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                                 filename, fileLineNum);

            /* No random number is drawn for lines whose weight is not positive. */
            double score = 0.0;
            if (weight > 0.0) score = uniform01(rng) ^^ (1.0 / weight);

            outputStream.formatValue(score, scoreFormatSpec);
            outputStream.put(cmdopt.delim);
            outputStream.put(line);
            outputStream.put("\n");

            /* The counter is only maintained when a limit was requested. */
            if (hasLimit && ++numLinesWritten == cmdopt.sampleSize) return;
        }
    }
}
809 
/** Outputs all lines from the input files and/or standard input in random order.
 *
 * The complete input is read into memory, each data line is given a random score
 * (weighted or unweighted, depending on isWeighted), and the lines are written in
 * descending score order. This routine is only for the case where no sample size
 * was given (cmdopt.sampleSize == 0).
 *
 * Because everything is held in memory, input size is limited by available memory.
 * Larger data sets need a disk oriented technique, for example generating random
 * values line-by-line (ala --gen-random-inorder) and sorting with a disk-backed sort
 * program like GNU sort.
 *
 * Reading the files in bulk and sorting is significantly faster than the line-by-line
 * heap approach of reservoir sampling. Both approaches have to hold all the data in
 * memory when every line is being randomized.
 *
 * Note: For the unweighted case std.random.randomShuffle from the D standard library
 * would be faster still, as it is an O(n) swapping algorithm rather than the
 * O(n log n) sort used here. The downsides are that the result order would no longer
 * be consistent with the other routines and that printing random numbers would not
 * make sense. Order consistency matters only in the rare case when multiple
 * randomizations are being done with the same static seed.
 */
void randomizeLines(Flag!"isWeighted" isWeighted, OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : min, sort, splitter;
    import std.array : appender;
    import std.format : formatValue, singleSpec;
    import std.random : Random, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    /* The compile-time flag must agree with the command line options. */
    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize == 0);

    /* The full contents of one input, tagged with the name it was read from. */
    struct FileData
    {
        string filename;
        char[] data;
    }

    auto fileData = new FileData[cmdopt.files.length];

    /*
     * Step 1: Read the contents of every input into memory.
     */
    ubyte[1024 * 128] fileRawBuf;
    foreach (fileNum, filename; cmdopt.files)
    {
        auto slot = &fileData[fileNum];
        slot.filename = filename;
        auto dataAppender = appender(&(slot.data));

        immutable bool readingStdin = (filename == "-");
        auto ifile = readingStdin ? stdin : filename.File;

        /* For real files, reserve space up front when the size is available. */
        if (!readingStdin)
        {
            ulong filesize = ifile.size;
            if (filesize < ulong.max) dataAppender.reserve(min(filesize, size_t.max));
        }

        foreach (ref ubyte[] buffer; ifile.byChunk(fileRawBuf)) dataAppender.put(cast(char[]) buffer);
    }

    /*
     * Step 2: Break the data into lines, scoring each data line.
     */
    struct ScoredLine
    {
        double score;
        char[] line;
    }

    auto scoredLines = appender!(ScoredLine[]);
    auto rng = Random(cmdopt.seed);
    bool headerDone = false;

    foreach (fd; fileData)
    {
        /* A trailing newline is dropped so the splitter does not yield an extra empty line. */
        auto data = fd.data;
        if (data.length > 0 && data[$ - 1] == '\n') data = data[0 .. $ - 1];

        foreach (fileLineNum, line; data.splitter('\n').enumerate(1))
        {
            immutable bool onFirstLine = (fileLineNum == 1);
            if (onFirstLine) throwIfWindowsNewlineOnUnix(line, fd.filename, fileLineNum);

            if (onFirstLine && cmdopt.hasHeader)
            {
                /* Only the header of the first file is written. */
                if (headerDone) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put(cmdopt.randomValueHeader);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");
                headerDone = true;
                continue;
            }

            static if (isWeighted)
            {
                double weight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, fd.filename, fileLineNum);

                /* No random number is drawn for lines whose weight is not positive. */
                double score = 0.0;
                if (weight > 0.0) score = uniform01(rng) ^^ (1.0 / weight);
            }
            else
            {
                double score = uniform01(rng);
            }

            /* The line is a slice of the in-memory file data, no copy is made. */
            scoredLines.put(ScoredLine(score, line));
        }
    }

    /*
     * Step 3: Order by descending score and write the result.
     */
    scoredLines.data.sort!((a, b) => a.score > b.score);

    immutable scoreFormatSpec = singleSpec("%.17g");

    foreach (scored; scoredLines.data)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatValue(scored.score, scoreFormatSpec);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(scored.line);
        outputStream.put("\n");
    }
}
944 
/** Simple random sampling with replacement.
 *
 * All lines in files and/or standard input are read in. Then random lines are selected
 * one at a time and output. Lines can be selected multiple times. This process continues
 * until the desired number of samples (--n|num) has been output. Output continues
 * indefinitely if a sample size was not provided.
 *
 * When cmdopt.hasHeader is set, the first line of the first file is written as the
 * header and the first line of every other file is skipped. Nothing beyond the header
 * is written if the inputs contain no data lines.
 */
void simpleRandomSamplingWithReplacement(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : min, splitter;
    import std.array : appender;
    import std.random : Random, uniform;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    struct FileData
    {
        string filename;
        char[] data;
    }

    auto fileData = new FileData[cmdopt.files.length];

    /*
     * Read all file data into memory.
     */
    ubyte[1024 * 128] fileRawBuf;
    foreach (fileNum, filename; cmdopt.files)
    {
        fileData[fileNum].filename = filename;
        auto dataAppender = appender(&(fileData[fileNum].data));
        auto ifile = (filename == "-") ? stdin : filename.File;

        /* For real files, reserve space up front when the size is available. */
        if (filename != "-")
        {
            ulong filesize = ifile.size;
            if (filesize < ulong.max) dataAppender.reserve(min(filesize, size_t.max));
        }

        foreach (ref ubyte[] buffer; ifile.byChunk(fileRawBuf)) dataAppender.put(cast(char[]) buffer);
    }

    /*
     * Split the data into lines. Each entry is a slice of the in-memory file data.
     */
    struct Entry
    {
        char[] line;
    }

    auto lines = appender!(Entry[]);
    bool headerWritten = false;

    foreach (fd; fileData)
    {
        /* Drop the last newline to avoid adding an extra empty line. */
        auto data = (fd.data.length > 0 && fd.data[$ - 1] == '\n') ? fd.data[0 .. $ - 1] : fd.data;
        foreach (fileLineNum, line; data.splitter('\n').enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, fd.filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                lines.put(Entry(line));
            }
        }
    }

    /* Sampling is only possible when there is at least one data line to pick from. */
    if (lines.data.length > 0)
    {
        auto randomGenerator = Random(cmdopt.seed);

        /* Repeat forever if sampleSize is zero, otherwise print sampleSize lines.
         * In the forever case numLeft stays at one because it is never decremented.
         */
        size_t numLeft = (cmdopt.sampleSize == 0) ? 1 : cmdopt.sampleSize;
        while (numLeft != 0)
        {
            size_t index = uniform(0, lines.data.length, randomGenerator);
            outputStream.put(lines.data[index].line);
            outputStream.put("\n");
            if (cmdopt.sampleSize != 0) numLeft--;
        }
    }
}
1037 
1038 
import std.traits : isSomeChar;

/** Convenience function for extracting a single field from a line. See getTsvFieldValue
 * in the tsvutil module (common/src/) for details. This wrapper creates error text
 * tailored for this program.
 *
 * Params:
 *   line       = The line to extract the field from.
 *   fieldIndex = Index of the field to extract, passed through to getTsvFieldValue.
 *   delim      = The field delimiter.
 *   filename   = Name of the input the line came from, used in error messages. "-" is
 *                reported as "Standard Input".
 *   lineNum    = Line number within the input, used in error messages.
 *
 * Returns: The field's value converted to type T.
 *
 * Throws: Exception if getTsvFieldValue fails. The message includes the underlying
 * error text, the file name, and the line number. For a conversion failure on line 1
 * the message also suggests the line may be a header.
 */
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException, to;
    import std.format : format;
    import tsvutil : getTsvFieldValue;

    T val;
    try
    {
        val = getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* The field was found but could not be converted to T. */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s%s",
                   exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum,
                   (lineNum == 1) ? "\n  Is this a header line? Use --H|header to skip." : ""));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n  File: %s Line: %s",
                   exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum));
    }

    return val;
}
1072 
unittest
{
    /* Basic sanity checks on the getFieldValue wrapper. getTsvFieldValue has its own
     * tests, so only the wrapper's pass-through and error paths are exercised here.
     */
    import std.exception : assertThrown;

    /* Values that convert cleanly are returned unchanged. */
    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);

    /* Failures throw both on line 1 and on later lines. */
    foreach (size_t lineNum; 1 .. 3)
    {
        /* Field is not a number. */
        assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", lineNum));

        /* Field index is past the end of the line. */
        assertThrown(getFieldValue!double("123", 1, '\t', "unittest", lineNum));
    }
}
1087 
1088 /* Unit tests for the main program start here.
1089  *
1090  * Portability note: Many of the tests here rely on generating consistent random numbers
1091  * across different platforms when using the same random seed. So far this has succeeded
 * on several different platform, compiler, and library versions. However, it is certainly
1093  * possible this condition will not hold on other platforms.
1094  *
1095  * For tsv-sample, this portability implies generating the same results on different
1096  * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
1097  * but it is convenient for testing. If platforms are identified that do not generate
1098  * the same results these tests will need to be adjusted.
1099  */
version(unittest)
{
    /* Unit test helper functions. */

    import unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /** Runs tsvSample with the given command line and asserts on the output.
     *
     * cmdArgs is processed the same way main does it. The output is collected in
     * memory and compared to expected, after expected has been converted to text
     * by tsvDataToString. cmdArgs must not be empty, its first element is used as
     * the program name in assert messages.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Builds an assert message prefixed with the helper and program names. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            return format("[testTsvSample] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        /* Stringify the arguments before they are handed to processArgs. */
        auto argsAsText = cmdArgs.to!string;

        TsvSampleOptions cmdopt;
        auto parseResult = cmdopt.processArgs(cmdArgs);
        assert(parseResult[0], formatAssertMessage("Invalid command lines arg: '%s'.", argsAsText));

        auto actual = appender!(char[])();
        tsvSample(cmdopt, actual);    // This invokes the main code line.

        auto expectedText = expected.tsvDataToString;

        assert(actual.data == expectedText,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedText.to!string, actual.data.to!string));
    }
}
1136 
1137 unittest
1138 {
1139     import std.path : buildPath;
1140     import std.file : rmdirRecurse;
1141     import std.format : format;
1142 
1143     auto testDir = makeUnittestTempDir("tsv_sample");
1144     scope(exit) testDir.rmdirRecurse;
1145 
1146     /* Tabular data sets and expected results use the built-in static seed.
1147      * Tests are run by writing the data set to a file, then calling the main
1148      * routine to process. The function testTsvSample plays the role of the
1149      * main program. Rather than writing to expected output, the results are
1150      * matched against expected. The expected results were verified by hand
1151      * prior to inclusion in the test.
1152      *
1153      * The initial part of this section is simply setting up data files and
1154      * expected results.
1155      */
1156 
1157     /* Empty file. */
1158     string[][] dataEmpty = [];
1159     string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv");
1160     writeUnittestTsvFile(fpath_dataEmpty, dataEmpty);
1161 
    /* 3x0, header only. */
1163     string[][] data3x0 = [["field_a", "field_b", "field_c"]];
1164     string fpath_data3x0 = buildPath(testDir, "data3x0.tsv");
1165     writeUnittestTsvFile(fpath_data3x0, data3x0);
1166 
1167     /* 3x1 */
1168     string[][] data3x1 =
1169         [["field_a", "field_b", "field_c"],
1170          ["tan", "タン", "8.5"]];
1171 
1172     string fpath_data3x1 = buildPath(testDir, "data3x1.tsv");
1173     string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv");
1174     writeUnittestTsvFile(fpath_data3x1, data3x1);
1175     writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]);
1176 
1177     string[][] data3x1ExpectedReplace3 =
1178         [["field_a", "field_b", "field_c"],
1179          ["tan", "タン", "8.5"],
1180          ["tan", "タン", "8.5"],
1181          ["tan", "タン", "8.5"]];
1182 
1183     /* 3x2 */
1184     string[][] data3x2 =
1185         [["field_a", "field_b", "field_c"],
1186          ["brown", "褐色", "29.2"],
1187          ["gray", "グレー", "6.2"]];
1188 
1189     string fpath_data3x2 = buildPath(testDir, "data3x2.tsv");
1190     string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv");
1191     writeUnittestTsvFile(fpath_data3x2, data3x2);
1192     writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]);
1193 
1194     string[][] data3x2ExpectedNoWt =
1195         [["field_a", "field_b", "field_c"],
1196          ["gray", "グレー", "6.2"],
1197          ["brown", "褐色", "29.2"]];
1198 
1199     /* 3x3 */
1200     string[][] data3x3 =
1201         [["field_a", "field_b", "field_c"],
1202          ["orange", "オレンジ", "2.5"],
1203          ["pink", "ピンク", "1.1"],
1204          ["purple", "紫の", "42"]];
1205 
1206     string fpath_data3x3 = buildPath(testDir, "data3x3.tsv");
1207     string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv");
1208     writeUnittestTsvFile(fpath_data3x3, data3x3);
1209     writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]);
1210 
1211     string[][] data3x3ExpectedNoWt =
1212         [["field_a", "field_b", "field_c"],
1213          ["purple", "紫の", "42"],
1214          ["pink", "ピンク", "1.1"],
1215          ["orange", "オレンジ", "2.5"]];
1216 
1217     /* 3x6 */
1218     string[][] data3x6 =
1219         [["field_a", "field_b", "field_c"],
1220          ["red", "赤", "23.8"],
1221          ["green", "緑", "0.0072"],
1222          ["white", "白", "1.65"],
1223          ["yellow", "黄", "12"],
1224          ["blue", "青", "12"],
1225          ["black", "黒", "0.983"]];
1226     string fpath_data3x6 = buildPath(testDir, "data3x6.tsv");
1227     string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv");
1228     writeUnittestTsvFile(fpath_data3x6, data3x6);
1229     writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]);
1230 
1231     string[][] data3x6ExpectedNoWt =
1232         [["field_a", "field_b", "field_c"],
1233          ["yellow", "黄", "12"],
1234          ["black", "黒", "0.983"],
1235          ["blue", "青", "12"],
1236          ["white", "白", "1.65"],
1237          ["green", "緑", "0.0072"],
1238          ["red", "赤", "23.8"]];
1239 
1240     string[][] data3x6ExpectedNoWtProbs =
1241         [["random_value", "field_a", "field_b", "field_c"],
1242          ["0.96055546286515892", "yellow", "黄", "12"],
1243          ["0.7571015392895788", "black", "黒", "0.983"],
1244          ["0.52525980887003243", "blue", "青", "12"],
1245          ["0.49287854949943721", "white", "白", "1.65"],
1246          ["0.15929344086907804", "green", "緑", "0.0072"],
1247          ["0.010968807619065046", "red", "赤", "23.8"]];
1248 
1249     string[][] data3x6ExpectedProbsBernoulliSampleP100 =
1250         [["random_value", "field_a", "field_b", "field_c"],
1251          ["0.010968807619065046", "red", "赤", "23.8"],
1252          ["0.15929344086907804", "green", "緑", "0.0072"],
1253          ["0.49287854949943721", "white", "白", "1.65"],
1254          ["0.96055546286515892", "yellow", "黄", "12"],
1255          ["0.52525980887003243", "blue", "青", "12"],
1256          ["0.7571015392895788", "black", "黒", "0.983"]];
1257 
1258     string[][] data3x6ExpectedProbsBernoulliSampleP60 =
1259         [["random_value", "field_a", "field_b", "field_c"],
1260          ["0.010968807619065046", "red", "赤", "23.8"],
1261          ["0.15929344086907804", "green", "緑", "0.0072"],
1262          ["0.49287854949943721", "white", "白", "1.65"],
1263          ["0.52525980887003243", "blue", "青", "12"]];
1264 
1265     string[][] data3x6ExpectedBernoulliSampleP60 =
1266         [["field_a", "field_b", "field_c"],
1267          ["red", "赤", "23.8"],
1268          ["green", "緑", "0.0072"],
1269          ["white", "白", "1.65"],
1270          ["blue", "青", "12"]];
1271 
1272     string[][] data3x6ExpectedDistinctSampleK1K3P60 =
1273         [["field_a", "field_b", "field_c"],
1274          ["green", "緑", "0.0072"],
1275          ["white", "白", "1.65"],
1276          ["blue", "青", "12"]];
1277 
1278     string[][] data3x6ExpectedDistinctSampleK1K3P60Probs =
1279         [["random_value", "field_a", "field_b", "field_c"],
1280          ["0", "green", "緑", "0.0072"],
1281          ["0", "white", "白", "1.65"],
1282          ["0", "blue", "青", "12"]];
1283 
1284     string[][] data3x6ExpectedDistinctSampleK1K3P60ProbsRVCustom =
1285         [["custom_random_value_header", "field_a", "field_b", "field_c"],
1286          ["0", "green", "緑", "0.0072"],
1287          ["0", "white", "白", "1.65"],
1288          ["0", "blue", "青", "12"]];
1289 
1290     string[][] data3x6ExpectedDistinctSampleK2P2ProbsInorder =
1291         [["random_value", "field_a", "field_b", "field_c"],
1292          ["1", "red", "赤", "23.8"],
1293          ["0", "green", "緑", "0.0072"],
1294          ["0", "white", "白", "1.65"],
1295          ["1", "yellow", "黄", "12"],
1296          ["3", "blue", "青", "12"],
1297          ["2", "black", "黒", "0.983"]];
1298 
1299     string[][] data3x6ExpectedWt3Probs =
1300         [["random_value", "field_a", "field_b", "field_c"],
1301          ["0.9966519875764539", "yellow", "黄", "12"],
1302          ["0.94775884809836686", "blue", "青", "12"],
1303          ["0.82728234682286661", "red", "赤", "23.8"],
1304          ["0.75346697377181959", "black", "黒", "0.983"],
1305          ["0.65130103496422487", "white", "白", "1.65"],
1306          ["1.5636943712879866e-111", "green", "緑", "0.0072"]];
1307 
1308     string[][] data3x6ExpectedWt3ProbsInorder =
1309         [["random_value", "field_a", "field_b", "field_c"],
1310          ["0.82728234682286661", "red", "赤", "23.8"],
1311          ["1.5636943712879866e-111", "green", "緑", "0.0072"],
1312          ["0.65130103496422487", "white", "白", "1.65"],
1313          ["0.9966519875764539", "yellow", "黄", "12"],
1314          ["0.94775884809836686", "blue", "青", "12"],
1315          ["0.75346697377181959", "black", "黒", "0.983"]];
1316 
1317     string[][] data3x6ExpectedWt3 =
1318         [["field_a", "field_b", "field_c"],
1319          ["yellow", "黄", "12"],
1320          ["blue", "青", "12"],
1321          ["red", "赤", "23.8"],
1322          ["black", "黒", "0.983"],
1323          ["white", "白", "1.65"],
1324          ["green", "緑", "0.0072"]];
1325 
1326     string[][] data3x6ExpectedReplace10 =
1327         [["field_a", "field_b", "field_c"],
1328          ["black", "黒", "0.983"],
1329          ["green", "緑", "0.0072"],
1330          ["green", "緑", "0.0072"],
1331          ["red", "赤", "23.8"],
1332          ["yellow", "黄", "12"],
1333          ["red", "赤", "23.8"],
1334          ["white", "白", "1.65"],
1335          ["yellow", "黄", "12"],
1336          ["yellow", "黄", "12"],
1337          ["white", "白", "1.65"],
1338         ];
1339 
1340     string[][] data3x6ExpectedReplace10V77 =
1341         [["field_a", "field_b", "field_c"],
1342          ["black", "黒", "0.983"],
1343          ["red", "赤", "23.8"],
1344          ["black", "黒", "0.983"],
1345          ["yellow", "黄", "12"],
1346          ["green", "緑", "0.0072"],
1347          ["green", "緑", "0.0072"],
1348          ["green", "緑", "0.0072"],
1349          ["yellow", "黄", "12"],
1350          ["blue", "青", "12"],
1351          ["white", "白", "1.65"],
1352         ];
1353 
1354     /* Using a different static seed. */
1355     string[][] data3x6ExpectedNoWtV41Probs =
1356         [["random_value", "field_a", "field_b", "field_c"],
1357          ["0.68057272653095424", "green", "緑", "0.0072"],
1358          ["0.67681624367833138", "blue", "青", "12"],
1359          ["0.32097338931635022", "yellow", "黄", "12"],
1360          ["0.25092361867427826", "red", "赤", "23.8"],
1361          ["0.15535934292711318", "black", "黒", "0.983"],
1362          ["0.04609582107514143", "white", "白", "1.65"]];
1363 
1364     string[][] data3x6ExpectedV41ProbsBernoulliSampleP60 =
1365         [["random_value", "field_a", "field_b", "field_c"],
1366          ["0.25092361867427826", "red", "赤", "23.8"],
1367          ["0.04609582107514143", "white", "白", "1.65"],
1368          ["0.32097338931635022", "yellow", "黄", "12"],
1369          ["0.15535934292711318", "black", "黒", "0.983"]];
1370 
1371     string[][] data3x6ExpectedWt3V41Probs =
1372         [["random_value", "field_a", "field_b", "field_c"],
1373          ["0.96799377498910666", "blue", "青", "12"],
1374          ["0.94356245792573568", "red", "赤", "23.8"],
1375          ["0.90964601024271996", "yellow", "黄", "12"],
1376          ["0.15491658409260103", "white", "白", "1.65"],
1377          ["0.15043620392537033", "black", "黒", "0.983"],
1378          ["6.1394674830701461e-24", "green", "緑", "0.0072"]];
1379 
1380     string[][] data3x6ExpectedWt3V41ProbsInorder =
1381         [["random_value", "field_a", "field_b", "field_c"],
1382          ["0.94356245792573568", "red", "赤", "23.8"],
1383          ["6.1394674830701461e-24", "green", "緑", "0.0072"],
1384          ["0.15491658409260103", "white", "白", "1.65"],
1385          ["0.90964601024271996", "yellow", "黄", "12"],
1386          ["0.96799377498910666", "blue", "青", "12"],
1387          ["0.15043620392537033", "black", "黒", "0.983"]];
1388 
1389 
1390     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
1391     string[][] combo1ExpectedNoWt =
1392         [["field_a", "field_b", "field_c"],
1393          ["yellow", "黄", "12"],
1394          ["tan", "タン", "8.5"],
1395          ["brown", "褐色", "29.2"],
1396          ["green", "緑", "0.0072"],
1397          ["red", "赤", "23.8"],
1398          ["purple", "紫の", "42"],
1399          ["black", "黒", "0.983"],
1400          ["white", "白", "1.65"],
1401          ["gray", "グレー", "6.2"],
1402          ["blue", "青", "12"],
1403          ["pink", "ピンク", "1.1"],
1404          ["orange", "オレンジ", "2.5"]];
1405 
1406     string[][] combo1ExpectedNoWtProbs =
1407         [["random_value", "field_a", "field_b", "field_c"],
1408          ["0.97088520275428891", "yellow", "黄", "12"],
1409          ["0.96055546286515892", "tan", "タン", "8.5"],
1410          ["0.81756894313730299", "brown", "褐色", "29.2"],
1411          ["0.7571015392895788", "green", "緑", "0.0072"],
1412          ["0.52525980887003243", "red", "赤", "23.8"],
1413          ["0.49287854949943721", "purple", "紫の", "42"],
1414          ["0.47081507067196071", "black", "黒", "0.983"],
1415          ["0.38388182921335101", "white", "白", "1.65"],
1416          ["0.29215990612283349", "gray", "グレー", "6.2"],
1417          ["0.24033216014504433", "blue", "青", "12"],
1418          ["0.15929344086907804", "pink", "ピンク", "1.1"],
1419          ["0.010968807619065046", "orange", "オレンジ", "2.5"]];
1420 
1421     /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */
1422     string[][] combo1ExpectedNoWtProbsInorder =
1423         [["random_value", "field_a", "field_b", "field_c"],
1424          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
1425          ["0.15929344086907804", "pink", "ピンク", "1.1"],
1426          ["0.49287854949943721", "purple", "紫の", "42"],
1427          ["0.96055546286515892", "tan", "タン", "8.5"],
1428          ["0.52525980887003243", "red", "赤", "23.8"],
1429          ["0.7571015392895788", "green", "緑", "0.0072"],
1430          ["0.38388182921335101", "white", "白", "1.65"],
1431          ["0.97088520275428891", "yellow", "黄", "12"],
1432          ["0.24033216014504433", "blue", "青", "12"],
1433          ["0.47081507067196071", "black", "黒", "0.983"],
1434          ["0.81756894313730299", "brown", "褐色", "29.2"],
1435          ["0.29215990612283349", "gray", "グレー", "6.2"]];
1436 
1437     string[][] combo1ExpectedProbsBernoulliSampleP50 =
1438         [["random_value", "field_a", "field_b", "field_c"],
1439          ["0.010968807619065046", "orange", "オレンジ", "2.5"],
1440          ["0.15929344086907804", "pink", "ピンク", "1.1"],
1441          ["0.49287854949943721", "purple", "紫の", "42"],
1442          ["0.38388182921335101", "white", "白", "1.65"],
1443          ["0.24033216014504433", "blue", "青", "12"],
1444          ["0.47081507067196071", "black", "黒", "0.983"],
1445          ["0.29215990612283349", "gray", "グレー", "6.2"]];
1446 
1447     string[][] combo1ExpectedBernoulliSampleP40 =
1448         [["field_a", "field_b", "field_c"],
1449          ["orange", "オレンジ", "2.5"],
1450          ["pink", "ピンク", "1.1"],
1451          ["white", "白", "1.65"],
1452          ["blue", "青", "12"],
1453          ["gray", "グレー", "6.2"]];
1454 
1455     string[][] combo1ExpectedDistinctSampleK1P40 =
1456         [["field_a", "field_b", "field_c"],
1457          ["orange", "オレンジ", "2.5"],
1458          ["red", "赤", "23.8"],
1459          ["green", "緑", "0.0072"],
1460          ["blue", "青", "12"],
1461          ["black", "黒", "0.983"]];
1462 
1463     string[][] combo1ExpectedWt3Probs =
1464         [["random_value", "field_a", "field_b", "field_c"],
1465          ["0.99754077523718754", "yellow", "黄", "12"],
1466          ["0.99527665440088786", "tan", "タン", "8.5"],
1467          ["0.99312578945741659", "brown", "褐色", "29.2"],
1468          ["0.98329602553389361", "purple", "紫の", "42"],
1469          ["0.9733096193808366", "red", "赤", "23.8"],
1470          ["0.88797551521739648", "blue", "青", "12"],
1471          ["0.81999230489041786", "gray", "グレー", "6.2"],
1472          ["0.55975569204250941", "white", "白", "1.65"],
1473          ["0.46472135609205739", "black", "黒", "0.983"],
1474          ["0.18824582704191337", "pink", "ピンク", "1.1"],
1475          ["0.1644613185329992", "orange", "オレンジ", "2.5"],
1476          ["1.6438086931020549e-17", "green", "緑", "0.0072"]];
1477 
1478     string[][] combo1ExpectedWt3 =
1479         [["field_a", "field_b", "field_c"],
1480          ["yellow", "黄", "12"],
1481          ["tan", "タン", "8.5"],
1482          ["brown", "褐色", "29.2"],
1483          ["purple", "紫の", "42"],
1484          ["red", "赤", "23.8"],
1485          ["blue", "青", "12"],
1486          ["gray", "グレー", "6.2"],
1487          ["white", "白", "1.65"],
1488          ["black", "黒", "0.983"],
1489          ["pink", "ピンク", "1.1"],
1490          ["orange", "オレンジ", "2.5"],
1491          ["green", "緑", "0.0072"]];
1492 
1493     string[][] combo1ExpectedReplace10 =
1494         [["field_a", "field_b", "field_c"],
1495          ["gray", "グレー", "6.2"],
1496          ["yellow", "黄", "12"],
1497          ["yellow", "黄", "12"],
1498          ["white", "白", "1.65"],
1499          ["tan", "タン", "8.5"],
1500          ["white", "白", "1.65"],
1501          ["blue", "青", "12"],
1502          ["black", "黒", "0.983"],
1503          ["tan", "タン", "8.5"],
1504          ["purple", "紫の", "42"]];
1505 
1506     /* 1x10 - Simple 1-column file. */
1507     string[][] data1x10 =
1508         [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]];
1509     string fpath_data1x10 = buildPath(testDir, "data1x10.tsv");
1510     string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv");
1511     writeUnittestTsvFile(fpath_data1x10, data1x10);
1512     writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]);
1513 
1514     string[][] data1x10ExpectedNoWt =
1515         [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]];
1516 
1517     string[][] data1x10ExpectedWt1 =
1518         [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]];
1519 
1520     /* 2x10a - Uniform distribution [0,1]. */
1521     string[][] data2x10a =
1522         [["line", "weight"],
1523          ["1", "0.26788837"],
1524          ["2", "0.06601298"],
1525          ["3", "0.38627527"],
1526          ["4", "0.47379424"],
1527          ["5", "0.02966641"],
1528          ["6", "0.05636231"],
1529          ["7", "0.70529242"],
1530          ["8", "0.91836862"],
1531          ["9", "0.99103720"],
1532          ["10", "0.31401740"]];
1533 
1534     string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv");
1535     writeUnittestTsvFile(fpath_data2x10a, data2x10a);
1536 
1537     string[][] data2x10aExpectedWt2Probs =
1538         [["random_value", "line", "weight"],
1539          ["0.96833865494543658", "8", "0.91836862"],
1540          ["0.91856842054413923", "4", "0.47379424"],
1541          ["0.25730832087795091", "7", "0.70529242"],
1542          ["0.2372531790701812", "9", "0.99103720"],
1543          ["0.16016096701872204", "3", "0.38627527"],
1544          ["0.090819662667243381", "10", "0.31401740"],
1545          ["0.0071764539244361172", "6", "0.05636231"],
1546          ["4.8318642951630057e-08", "1", "0.26788837"],
1547          ["3.7525692966535517e-10", "5", "0.02966641"],
1548          ["8.2123247880095796e-13", "2", "0.06601298"]];
1549 
1550     /* 2x10b - Uniform distribution [0,1000]. */
1551     string[][] data2x10b =
1552         [["line", "weight"],
1553          ["1", "761"],
1554          ["2", "432"],
1555          ["3", "103"],
1556          ["4", "448"],
1557          ["5", "750"],
1558          ["6", "711"],
1559          ["7", "867"],
1560          ["8", "841"],
1561          ["9", "963"],
1562          ["10", "784"]];
1563 
1564     string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv");
1565     writeUnittestTsvFile(fpath_data2x10b, data2x10b);
1566 
1567     string[][] data2x10bExpectedWt2Probs =
1568         [["random_value", "line", "weight"],
1569          ["0.99996486739067969", "8", "841"],
1570          ["0.99991017467137211", "4", "448"],
1571          ["0.99960871524873662", "6", "711"],
1572          ["0.999141885371438", "5", "750"],
1573          ["0.99903963250274785", "10", "784"],
1574          ["0.99889631825931946", "7", "867"],
1575          ["0.99852058315191139", "9", "963"],
1576          ["0.99575669679158918", "2", "432"],
1577          ["0.99408758732050595", "1", "761"],
1578          ["0.99315467761212362", "3", "103"]];
1579 
1580     /* 2x10c - Logarithmic distribution in random order. */
1581     string[][] data2x10c =
1582         [["line", "weight"],
1583          ["1", "31.85"],
1584          ["2", "17403.31"],
1585          ["3", "653.84"],
1586          ["4", "8.23"],
1587          ["5", "2671.04"],
1588          ["6", "26226.08"],
1589          ["7", "1.79"],
1590          ["8", "354.56"],
1591          ["9", "35213.81"],
1592          ["10", "679.29"]];
1593 
1594     string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv");
1595     writeUnittestTsvFile(fpath_data2x10c, data2x10c);
1596 
1597     string[][] data2x10cExpectedWt2Probs =
1598         [["random_value", "line", "weight"],
1599          ["0.99998939008709697", "6", "26226.08"],
1600          ["0.99995951291695517", "9", "35213.81"],
1601          ["0.99991666907613541", "8", "354.56"],
1602          ["0.9998944505218641", "2", "17403.31"],
1603          ["0.9997589760286163", "5", "2671.04"],
1604          ["0.99891852769877643", "3", "653.84"],
1605          ["0.99889167752782515", "10", "679.29"],
1606          ["0.99512207506850148", "4", "8.23"],
1607          ["0.86789371584259023", "1", "31.85"],
1608          ["0.5857443816291561", "7", "1.79"]];
1609 
1610     /* 2x10d. Logarithmic distribution in ascending order. */
1611     string[][] data2x10d =
1612         [["line", "weight"],
1613          ["1", "1.79"],
1614          ["2", "8.23"],
1615          ["3", "31.85"],
1616          ["4", "354.56"],
1617          ["5", "653.84"],
1618          ["6", "679.29"],
1619          ["7", "2671.04"],
1620          ["8", "17403.31"],
1621          ["9", "26226.08"],
1622          ["10", "35213.81"]];
1623 
1624     string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv");
1625     writeUnittestTsvFile(fpath_data2x10d, data2x10d);
1626 
1627     string[][] data2x10dExpectedWt2Probs =
1628         [["random_value", "line", "weight"],
1629          ["0.99999830221846353", "8", "17403.31"],
1630          ["0.99997860834041397", "10", "35213.81"],
1631          ["0.99994563828986716", "9", "26226.08"],
1632          ["0.99988650363575737", "4", "354.56"],
1633          ["0.99964161939190088", "7", "2671.04"],
1634          ["0.99959045338948649", "6", "679.29"],
1635          ["0.99901574490639788", "5", "653.84"],
1636          ["0.97803163304747431", "3", "31.85"],
1637          ["0.79994791806910948", "2", "8.23"],
1638          ["0.080374261239949119", "1", "1.79"]];
1639 
1640     /* 2x10e. Logarithmic distribution in descending order. */
1641     string[][] data2x10e =
1642         [["line", "weight"],
1643          ["1", "35213.81"],
1644          ["2", "26226.08"],
1645          ["3", "17403.31"],
1646          ["4", "2671.04"],
1647          ["5", "679.29"],
1648          ["6", "653.84"],
1649          ["7", "354.56"],
1650          ["8", "31.85"],
1651          ["9", "8.23"],
1652          ["10", "1.79"]];
1653     string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
1654     writeUnittestTsvFile(fpath_data2x10e, data2x10e);
1655 
1656     string[][] data2x10eExpectedWt2Probs =
1657         [["random_value", "line", "weight"],
1658          ["0.99998493348975237", "4", "2671.04"],
1659          ["0.99995934807202624", "3", "17403.31"],
1660          ["0.99992995739727453", "2", "26226.08"],
1661          ["0.99987185679245649", "1", "35213.81"],
1662          ["0.99957451563173938", "6", "653.84"],
1663          ["0.99907273650209583", "8", "31.85"],
1664          ["0.99905260312968946", "5", "679.29"],
1665          ["0.99730333650516401", "7", "354.56"],
1666          ["0.84093902435227808", "9", "8.23"],
1667          ["0.65650015926290028", "10", "1.79"]];
1668 
1669     /* Data sets for distinct sampling. */
1670     string[][] data5x25 =
1671         [["ID", "Shape", "Color", "Size", "Weight"],
1672          ["01", "circle", "red", "S", "10"],
1673          ["02", "circle", "black", "L", "20"],
1674          ["03", "square", "black", "L", "20"],
1675          ["04", "circle", "green", "L", "30"],
1676          ["05", "ellipse", "red", "S", "20"],
1677          ["06", "triangle", "red", "S", "10"],
1678          ["07", "triangle", "red", "L", "20"],
1679          ["08", "square", "black", "S", "10"],
1680          ["09", "circle", "black", "S", "20"],
1681          ["10", "square", "green", "L", "20"],
1682          ["11", "triangle", "red", "L", "20"],
1683          ["12", "circle", "green", "L", "30"],
1684          ["13", "ellipse", "red", "S", "20"],
1685          ["14", "circle", "green", "L", "30"],
1686          ["15", "ellipse", "red", "L", "30"],
1687          ["16", "square", "red", "S", "10"],
1688          ["17", "circle", "black", "L", "20"],
1689          ["18", "square", "red", "S", "20"],
1690          ["19", "square", "black", "L", "20"],
1691          ["20", "circle", "red", "S", "10"],
1692          ["21", "ellipse", "black", "L", "30"],
1693          ["22", "triangle", "red", "L", "30"],
1694          ["23", "circle", "green", "S", "20"],
1695          ["24", "square", "green", "L", "20"],
1696          ["25", "circle", "red", "S", "10"],
1697             ];
1698 
1699     string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
1700     string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
1701     writeUnittestTsvFile(fpath_data5x25, data5x25);
1702     writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]);
1703 
1704     string[][] data5x25ExpectedDistinctSampleK2P40 =
1705         [["ID", "Shape", "Color", "Size", "Weight"],
1706          ["03", "square", "black", "L", "20"],
1707          ["05", "ellipse", "red", "S", "20"],
1708          ["08", "square", "black", "S", "10"],
1709          ["10", "square", "green", "L", "20"],
1710          ["13", "ellipse", "red", "S", "20"],
1711          ["15", "ellipse", "red", "L", "30"],
1712          ["16", "square", "red", "S", "10"],
1713          ["18", "square", "red", "S", "20"],
1714          ["19", "square", "black", "L", "20"],
1715          ["21", "ellipse", "black", "L", "30"],
1716          ["24", "square", "green", "L", "20"],
1717             ];
1718 
1719     string[][] data5x25ExpectedDistinctSampleK2K4P20 =
1720         [["ID", "Shape", "Color", "Size", "Weight"],
1721          ["03", "square", "black", "L", "20"],
1722          ["07", "triangle", "red", "L", "20"],
1723          ["08", "square", "black", "S", "10"],
1724          ["10", "square", "green", "L", "20"],
1725          ["11", "triangle", "red", "L", "20"],
1726          ["16", "square", "red", "S", "10"],
1727          ["18", "square", "red", "S", "20"],
1728          ["19", "square", "black", "L", "20"],
1729          ["22", "triangle", "red", "L", "30"],
1730          ["24", "square", "green", "L", "20"],
1731             ];
1732 
1733     string[][] data5x25ExpectedDistinctSampleK2K3K4P20 =
1734         [["ID", "Shape", "Color", "Size", "Weight"],
1735          ["04", "circle", "green", "L", "30"],
1736          ["07", "triangle", "red", "L", "20"],
1737          ["09", "circle", "black", "S", "20"],
1738          ["11", "triangle", "red", "L", "20"],
1739          ["12", "circle", "green", "L", "30"],
1740          ["14", "circle", "green", "L", "30"],
1741          ["16", "square", "red", "S", "10"],
1742          ["18", "square", "red", "S", "20"],
1743          ["22", "triangle", "red", "L", "30"],
1744             ];
1745 
1746     /*
1747      * Enough setup! Actually run some tests!
1748      */
1749 
1750     /* Basic tests. Headers and static seed. With weights and without. */
1751     testTsvSample(["test-a1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
1752     testTsvSample(["test-a2", "--header", "--static-seed", fpath_data3x0], data3x0);
1753     testTsvSample(["test-a3", "-H", "-s", fpath_data3x1], data3x1);
1754     testTsvSample(["test-a4", "-H", "-s", fpath_data3x2], data3x2ExpectedNoWt);
1755     testTsvSample(["test-a5", "-H", "-s", fpath_data3x3], data3x3ExpectedNoWt);
1756     testTsvSample(["test-a6", "-H", "-s", fpath_data3x6], data3x6ExpectedNoWt);
1757     testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedNoWtProbs);
1758     testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedWt3);
1759     testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Probs);
1760     testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedNoWtV41Probs);
1761     testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedNoWtV41Probs);
1762     testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedNoWtProbs);
1763     testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedWt3V41Probs);
1764 
1765     /* Bernoulli sampling cases. */
1766     testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty);
1767     testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0);
1768     testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1);
1769     testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6);
1770     testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6);
1771     testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP100);
1772     testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP60);
1773     testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliSampleP60);
1774     testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedV41ProbsBernoulliSampleP60);
1775 
1776     /* Distinct sampling cases. */
1777     testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
1778     testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
1779     testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1);
1780     testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6);
1781     testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctSampleK1K3P60);
1782 
1783     /* Generating random weights. For uniform sampling, reuse the Bernoulli sampling expected results at prob 100%.
1784      * For weighted sampling, reuse the weighted cases, but with the expected results in the original input order.
1785      */
1786     testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP100);
1787     testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP100);  // NOTE(review): arguments and expected data are identical to test-a28 - confirm whether a different option was intended.
1788     testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
1789                   data3x6ExpectedWt3ProbsInorder);
1790     testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
1791                   data3x6ExpectedWt3V41ProbsInorder);
1792     testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6],
1793                   data3x6ExpectedDistinctSampleK1K3P60Probs);
1794     testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header",
1795                    "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctSampleK1K3P60ProbsRVCustom);
1796     testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
1797                   data3x6ExpectedDistinctSampleK2P2ProbsInorder);
1798 
1799     /* Simple random sampling with replacement. */
1800     testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty);
1801     testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty);
1802     testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0);
1803     testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0);
1804     testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplace3);
1805     testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplace10);
1806     testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplace10V77);
1807 
1808     /* Basic tests, without headers. */
1809     testTsvSample(["test-b1", "-s", fpath_data3x1_noheader], data3x1[1..$]);
1810     testTsvSample(["test-b2", "-s", fpath_data3x2_noheader], data3x2ExpectedNoWt[1..$]);
1811     testTsvSample(["test-b3", "-s", fpath_data3x3_noheader], data3x3ExpectedNoWt[1..$]);
1812     testTsvSample(["test-b4", "-s", fpath_data3x6_noheader], data3x6ExpectedNoWt[1..$]);
1813     testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedNoWtProbs[1..$]);
1814     testTsvSample(["test-b6", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3[1..$]);
1815     testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Probs[1..$]);
1816     testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedNoWtV41Probs[1..$]);
1817     testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedWt3V41Probs[1..$]);
1818 
1819     /* Bernoulli sampling cases. */
1820     testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1..$]);
1821     testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
1822     testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
1823     testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedProbsBernoulliSampleP100[1..$]);
1824     testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedProbsBernoulliSampleP60[1..$]);
1825     testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedV41ProbsBernoulliSampleP60[1..$]);
1826 
1827     /* Distinct sampling cases. */
1828     testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]);
1829     testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
1830     testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
1831     testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
1832 
1833     /* Generating random weights, no header. The uniform case reuses the Bernoulli sampling expected results at prob 100%. Each case has a unique test name. */
1834     testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedProbsBernoulliSampleP100[1..$]);
1835     testTsvSample(["test-b21", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1..$]);
1836     testTsvSample(["test-b22", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
1837                   data3x6ExpectedDistinctSampleK1K3P60Probs[1..$]);
1838     testTsvSample(["test-b23", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
1839                   data3x6ExpectedDistinctSampleK2P2ProbsInorder[1..$]);
1840 
1841     /* Simple random sampling with replacement. */
1842     testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
1843     testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
1844     testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplace3[1..$]);
1845     testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplace10[1..$]);
1846     testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplace10V77[1..$]);
1847 
1848     /* Multi-file tests. */
1849     testTsvSample(["test-c1", "--header", "--static-seed",
1850                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1851                   combo1ExpectedNoWt);
1852     testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
1853                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1854                   combo1ExpectedNoWtProbs);
1855     testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
1856                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1857                   combo1ExpectedWt3Probs);
1858     testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3",
1859                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1860                   combo1ExpectedWt3);
1861 
1862     /* Multi-file, no headers. */
1863     testTsvSample(["test-c5", "--static-seed",
1864                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1865                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1866                   combo1ExpectedNoWt[1..$]);
1867     testTsvSample(["test-c6", "--static-seed", "--print-random",
1868                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1869                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1870                   combo1ExpectedNoWtProbs[1..$]);
1871     testTsvSample(["test-c7", "--static-seed", "--print-random", "--weight-field", "3",
1872                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1873                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1874                   combo1ExpectedWt3Probs[1..$]);
1875     testTsvSample(["test-c8", "--static-seed", "--weight-field", "3",
1876                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1877                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1878                   combo1ExpectedWt3[1..$]);
1879 
1880     /* Bernoulli sampling cases. */
1881     testTsvSample(["test-c9", "--header", "--static-seed", "--print-random", "--prob", ".5",
1882                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1883                   combo1ExpectedProbsBernoulliSampleP50);
1884     testTsvSample(["test-c10", "--header", "--static-seed", "--prob", ".4",
1885                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1886                   combo1ExpectedBernoulliSampleP40);
1887     testTsvSample(["test-c11", "--static-seed", "--print-random", "--prob", ".5",
1888                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1889                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1890                   combo1ExpectedProbsBernoulliSampleP50[1..$]);
1891     testTsvSample(["test-c12", "--static-seed", "--prob", ".4",
1892                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1893                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1894                   combo1ExpectedBernoulliSampleP40[1..$]);
1895 
1896     /* Distinct sampling cases. */
1897     testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
1898                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1899                   combo1ExpectedDistinctSampleK1P40);
1900     testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4",
1901                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1902                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1903                   combo1ExpectedDistinctSampleK1P40[1..$]);
1904 
1905     /* Generating random weights. */
1906     testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder",
1907                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1908                   combo1ExpectedNoWtProbsInorder);
1909     testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder",
1910                    fpath_data3x3_noheader, fpath_data3x1_noheader,
1911                    fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
1912                   combo1ExpectedNoWtProbsInorder[1..$]);
1913 
1914     /* Simple random sampling with replacement. */
1915     testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10",
1916                    fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
1917                   combo1ExpectedReplace10);
1918 
1919     testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10",
1920                    fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
1921                    fpath_data3x6_noheader, fpath_data3x2_noheader],
1922                   combo1ExpectedReplace10[1..$]);
1923 
1924     /* Single column file: unweighted (test-d1) and weighted by the single column (test-d2). */
1925     testTsvSample(["test-d1", "-H", "-s", fpath_data1x10], data1x10ExpectedNoWt);
1926     testTsvSample(["test-d2", "-H", "-s", "-w", "1", fpath_data1x10], data1x10ExpectedWt1);
1927 
1928     /* Distributions. One weighted sampling test per 2x10 data set, each with its own test name. */
1929     testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedWt2Probs);
1930     testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedWt2Probs);
1931     testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedWt2Probs);
1932     testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedWt2Probs);
1933     testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedWt2Probs);
1934 
1935     /* Tests of the subset sample size option (-n|--num).
1936      *
1937      * Note: These tests compare against prefixes of the full expected results, which
1938      * verifies that the subset length does not affect output order.
1939      */
1940     import std.algorithm : min;
1941     for (size_t n = data3x6.length + 2; n >= 1; n--)  // From larger than the data set down to a single line.
1942     {
1943         size_t expectedLength = min(data3x6.length, n + 1);  // n data lines plus the header line, capped at the full data set.
1944         testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
1945                        "-H", fpath_data3x6], data3x6ExpectedNoWt[0..expectedLength]);
1946 
1947         testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
1948                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedNoWtProbs[0..expectedLength]);
1949 
1950         testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
1951                        "-H", "-w", "3", fpath_data3x6], data3x6ExpectedWt3[0..expectedLength]);
1952 
1953         testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
1954                        "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Probs[0..expectedLength]);
1955 
1956         testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
1957                        fpath_data3x6_noheader], data3x6ExpectedNoWt[1..expectedLength]);
1958 
1959         testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
1960                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedNoWtProbs[1..expectedLength]);
1961 
1962         testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
1963                        "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3[1..expectedLength]);
1964 
1965         testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
1966                        "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Probs[1..expectedLength]);
1967 
1968         /* Bernoulli sampling (p=0.6): cap the slice at the length of the expected sample. */
1969         size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedProbsBernoulliSampleP60.length);
1970 
1971         testTsvSample([format("test-f9_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
1972                        "-H", "--print-random", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP60[0..sampleExpectedLength]);
1973 
1974         testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
1975                        "-H", fpath_data3x6], data3x6ExpectedBernoulliSampleP60[0..sampleExpectedLength]);
1976 
1977         testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
1978                        "--print-random", fpath_data3x6_noheader], data3x6ExpectedProbsBernoulliSampleP60[1..sampleExpectedLength]);
1979 
1980         testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
1981                        fpath_data3x6_noheader], data3x6ExpectedBernoulliSampleP60[1..sampleExpectedLength]);
1982 
1983         size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctSampleK1K3P60.length);  // Same cap for the distinct sampling expected results.
1984 
1985         testTsvSample([format("test-f13_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
1986                        "-H", fpath_data3x6], data3x6ExpectedDistinctSampleK1K3P60[0..distinctExpectedLength]);
1987 
1988         testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
1989                        fpath_data3x6_noheader], data3x6ExpectedDistinctSampleK1K3P60[1..distinctExpectedLength]);
1990 
1991         testTsvSample([format("test-f15_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
1992                        "-H", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP100[0..expectedLength]);
1993 
1994         testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
1995                        fpath_data3x6_noheader], data3x6ExpectedProbsBernoulliSampleP100[1..expectedLength]);
1996     }
1997 
1998     /* Similar tests with the 1x10 data set. */
1999     for (size_t n = data1x10.length + 2; n >= 1; n--)
2000     {
2001         size_t expectedLength = min(data1x10.length, n + 1);
2002         testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
2003                        "-H", fpath_data1x10], data1x10ExpectedNoWt[0..expectedLength]);
2004 
2005         testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
2006                        "-H", "-w", "1", fpath_data1x10], data1x10ExpectedWt1[0..expectedLength]);
2007 
2008         testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
2009                        fpath_data1x10_noheader], data1x10ExpectedNoWt[1..expectedLength]);
2010 
2011         testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
2012                        "-w", "1", fpath_data1x10_noheader], data1x10ExpectedWt1[1..expectedLength]);
2013     }
2014 
2015     /* Simple random sampling with replacement: ensure sample size doesn't change order. */
2016     for (size_t n = data3x6ExpectedReplace10.length - 1; n >= 1; n--)
2017     {
2018         testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
2019                       data3x6ExpectedReplace10[0 .. n + 1]);
2020 
2021         testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
2022                       data3x6ExpectedReplace10[1 .. n + 1]);
2023     }
2024 
2025     /* Distinct sampling tests. */
2026     testTsvSample(["test-i1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
2027                   data5x25ExpectedDistinctSampleK2P40);
2028 
2029     testTsvSample(["test-i2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
2030                   data5x25ExpectedDistinctSampleK2K4P20);
2031 
2032     testTsvSample(["test-i3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
2033                   data5x25ExpectedDistinctSampleK2K3K4P20);
2034 
2035     testTsvSample(["test-i4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
2036                   data5x25ExpectedDistinctSampleK2P40[1..$]);
2037 
2038     testTsvSample(["test-i5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
2039                   data5x25ExpectedDistinctSampleK2K4P20[1..$]);
2040 
2041     testTsvSample(["test-i6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
2042                   data5x25ExpectedDistinctSampleK2K3K4P20[1..$]);
2043 }