/**
Command line tool for randomizing or sampling lines from input streams. Several
sampling methods are available, including simple random sampling, weighted random
sampling, Bernoulli sampling, and distinct sampling.

Copyright (c) 2017-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_sample;

import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Parses the command line into a TsvSampleOptions, then runs tsvSample with
     * standard output wrapped in a BufferedOutputRange.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first.
     *
     * Returns: The exit code supplied by processArgs when it indicates the program
     *   should not continue (help, version, or an argument error); 1 if tsvSample
     *   throws an Exception (the message is written to stderr); 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;

        /* r[0]: true if processing should continue. r[1]: exit code when it should not. */
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];

        /* Reset LDC profile counters so command line processing is not included. */
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        try
        {
            import tsv_utils.common.utils : BufferedOutputRange;
            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/** Short help text. Printed ahead of the option list when help is requested. */
auto helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

The '--n|num' option limits the sample size produced. It speeds up line
order randomization and weighted sampling significantly. It is also used
to terminate sampling with replacement.

Use '--help-verbose' for detailed information.

Options:
EOS";

/** Detailed help text. Printed ahead of the option list for '--help-verbose'. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up line order randomization and weighted sampling significantly
(details below). It is also used to terminate sampling with replacement.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, so there is no memory accumulation. These
algorithms support arbitrary size inputs. Sampling with replacement reads
all lines into memory and is limited by available memory. The line order
randomization algorithms hold the full output set in memory prior to
generating results. This ultimately limits the size of the output set. For
these, memory needs can be reduced by using a sample size (--n|num). This
engages reservoir sampling. Output order is not affected. Both
'tsv-sample -n 1000' and 'tsv-sample | head -n 1000' produce the same
results, but the former is quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and are given the value zero. Input order is not retained,
instead lines are output ordered by the randomized weight that was
assigned. This means that a smaller valid sample can be produced by taking
the first N lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";

/** Container for command line options.
*/
struct TsvSampleOptions
{
    string programName;                        /// Program name
    string[] files;                            /// Input files
    bool helpVerbose = false;                  /// --help-verbose
    bool hasHeader = false;                    /// --H|header
    size_t sampleSize = 0;                     /// --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
    size_t[] keyFields;                        /// --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    /// --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           /// --r|replace
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    bool printRandom = false;                  /// --print-random
    bool genRandomInorder = false;             /// --gen-random-inorder
    string randomValueHeader = "random_value"; /// --random-value-header
    bool compatibilityMode = false;            /// --compatibility-mode
    char delim = '\t';                         /// --d|delimiter
    bool versionWanted = false;                /// --V|version
    bool preferSkipSampling = false;           /// --prefer-skip-sampling
    bool preferAlgorithmR = false;             /// --prefer-algorithm-r
    bool hasWeightField = false;               /// Derived.
    bool useBernoulliSampling = false;         /// Derived.
    bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
    bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
    uint seed = 0;                             /// Derived from --static-seed, --seed-value

    /** Parses the command line, validates option combinations, and fills in the
     * derived fields.
     *
     * Help and version requests are printed from here. Argument errors are caught
     * here and reported on stderr rather than propagated to the caller.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first. On success the option
     *             and file arguments are consumed, leaving only the first element.
     *
     * Returns: A tuple (bool, int). The bool is true if the program should continue
     *   with sampling. When it is false the int is the exit code to use: 0 after
     *   printing help or version information, 1 after an argument error.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : any, canFind, each;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected for output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                "k|key-fields",    "<field-list>  Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "w|weight-field",  "NUM  Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
                "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header",  "     Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
                "compatibility-mode", "     Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,

                "d|delimiter",     "CHR  Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "prefer-skip-sampling", "     (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
                &preferSkipSampling,

                "prefer-algorithm-r",   "     (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
                &preferAlgorithmR,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            if (srsWithReplacement)
            {
                if (hasWeightField)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");
                }
                else if (!inclusionProbability.isNaN)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");
                }
                else if (keyFields.length > 0)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");
                }
                else if (printRandom || genRandomInorder)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");
                }
            }

            if (keyFields.length > 0)
            {
                /* Note: useDistinctSampling is set as part of the inclusion probability checks below. */

                if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields.");

                if (keyFields.length == 1 && keyFields[0] == 0)
                {
                    distinctKeyIsFullLine = true;
                }
                else
                {
                    if (keyFields.length > 1 && keyFields.any!(x => x == 0))
                    {
                        throw new Exception("Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");
                    }

                    keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
                }
            }

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */
            if (!inclusionProbability.isNaN)
            {
                if (inclusionProbability <= 0.0 || inclusionProbability > 1.0)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));
                }

                if (keyFields.length > 0) useDistinctSampling = true;
                else useBernoulliSampling = true;

                if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together.");

                if (genRandomInorder && !useDistinctSampling)
                {
                    throw new Exception("--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used.");
                }
            }
            else if (genRandomInorder && !hasWeightField)
            {
                /* Unweighted gen-random-inorder is handled by the Bernoulli sampling routine. */
                useBernoulliSampling = true;
            }

            if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') ||
                randomValueHeader.canFind(delim))
            {
                throw new Exception("--random-value-header must be at least one character and not contain field delimiters or newlines.");
            }

            /* Random value printing implies compatibility-mode, otherwise user's selection is used. */
            if (printRandom || genRandomInorder) compatibilityMode = true;

            /* Seed. */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            /* Assume remaining args are files. Use standard input if files were not provided. */
            files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"];
            cmdArgs.length = 1;
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/** Invokes the appropriate sampling routine based on the command line arguments.
*/
void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    /* The mode checks are ordered. The first one that matches handles the entire run. */

    if (cmdopt.srsWithReplacement)
    {
        simpleRandomSamplingWithReplacement(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useBernoulliSampling)
    {
        bernoulliSamplingCommand(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useDistinctSampling)
    {
        if (cmdopt.genRandomInorder)
        {
            distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
        return;
    }

    if (cmdopt.genRandomInorder)
    {
        /* The modes checked above either deal with gen-random-inorder on their own
         * (Bernoulli, Distinct) or do not support it (SRS w/ Replacement). What is
         * left is the weighted case.
         */
        assert(cmdopt.hasWeightField);
        generateWeightedRandomValuesInorder(cmdopt, outputStream);
        return;
    }

    /* Line order randomization: the full input, or a fixed size sample of it. */
    if (cmdopt.sampleSize == 0) randomizeLinesCommand(cmdopt, outputStream);
    else reservoirSamplingCommand(cmdopt, outputStream);
}

/** Invokes the appropriate Bernoulli sampling routine based on the command line arguments.
 *
 * This routine selects the appropriate bernoulli sampling function and template
 * instantiation to use based on the command line arguments.
 *
 * See the bernoulliSkipSampling routine for a discussion of the choices behind the
 * skipSamplingProbabilityThreshold used here.
*/
void bernoulliSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(!cmdopt.hasWeightField);

    immutable double skipSamplingProbabilityThreshold = 0.04;

    /* Skip sampling requires an inclusion probability strictly less than 1.0 (it takes
     * the log of the discard rate), so a probability of 1.0 always uses the per-line
     * algorithm, even when skip sampling is preferred.
     */
    if (cmdopt.compatibilityMode ||
        cmdopt.inclusionProbability >= 1.0 ||
        (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling))
    {
        if (cmdopt.genRandomInorder)
        {
            bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
    }
    else
    {
        bernoulliSkipSampling(cmdopt, outputStream);
    }
}

/** Bernoulli sampling of lines on the input stream.
 *
 * Each input line is assigned a random value and output if less than
 * cmdopt.inclusionProbability. The order of the lines is not changed.
 *
 * This routine supports random value printing and gen-random-inorder value printing.
 * When instantiated with Yes.generateRandomAll every line is output, prefixed with
 * its random value.
 *
 * Params:
 *   generateRandomAll = Yes to output all lines with their random values
 *                       (gen-random-inorder), No for regular Bernoulli sampling.
 *   cmdopt            = Command line options. Reads files, seed, hasHeader,
 *                       printRandom, randomValueHeader, delim, inclusionProbability,
 *                       and sampleSize (zero means no limit).
 *   outputStream      = Output range receiving the header and selected lines.
 */
void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    auto randomGenerator = Random(cmdopt.seed);
    immutable randomValueFormatSpec = singleSpec("%.17g");

    /* The random value column is always written in gen-random-inorder mode, otherwise
     * only when --print-random is in effect.
     */
    immutable bool writeRandomValue = generateRandomAll || cmdopt.printRandom;

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the header from the first file is written. */
                if (!headerWritten)
                {
                    if (writeRandomValue)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                    }

                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                /* A random value is drawn for every data line, selected or not. */
                double lineScore = uniform01(randomGenerator);

                static if (generateRandomAll) enum bool isSelected = true;
                else immutable bool isSelected = lineScore < cmdopt.inclusionProbability;

                if (isSelected)
                {
                    if (writeRandomValue)
                    {
                        outputStream.formatValue(lineScore, randomValueFormatSpec);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put("\n");

                    /* Stop as soon as the requested sample size has been written. */
                    if (cmdopt.sampleSize != 0)
                    {
                        ++numLinesWritten;
                        if (numLinesWritten == cmdopt.sampleSize) return;
                    }
                }
            }
        }
    }
}

/** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips.
 *
 * Skip sampling works by skipping a random number of lines between selections. This
 * can be faster than assigning a random value to each line when the inclusion
 * probability is low, as it reduces the number of calls to the random number
 * generator. Both the random number generator and the log() function are called when
 * calculating the next skip size. These additional log() calls add up as the
 * probability increases.
 *
 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
 * file-oriented line sampling. This is obviously environment specific. In the
 * environments this implementation has been tested in, the performance improvements
 * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
 *
 * The algorithm does not assign random values to individual lines. This makes it
 * incompatible with random value printing. It is not suitable for compatibility mode
 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
 * does not have this property.
 *
 * The algorithm for calculating the skip size has been described by multiple sources.
 * There are two key variants depending on whether the total number of lines in the
 * data set is known in advance. (This implementation does not know the total.)
 * Useful references:
 * $(LIST
 *     * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
 *       ACM Trans on Mathematical Software, 1987. On-line:
 *       http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
 *     * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
 *       "Data Stream Management", Springer-Verlag, 2016. On-line:
 *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
 *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014.
On-line:
 *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
 * )
 *
 * Params:
 *   cmdopt       = Command line options. Reads files, seed, hasHeader,
 *                  inclusionProbability (must be in (0.0, 1.0)), and sampleSize
 *                  (zero means no limit).
 *   outputStream = Output range receiving the header and selected lines.
 */
void bernoulliSkipSampling(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.conv : to;
    import std.math : log, trunc;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.compatibilityMode);

    auto randomGenerator = Random(cmdopt.seed);

    immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
    immutable double logDiscardRate = log(discardRate);

    /* Draws the number of lines to skip before the next selected line.
     *
     * Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed
     * interval to (0.0, 1.0], excluding 0.0.
     */
    size_t nextSkipCount()
    {
        return (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
    }

    size_t remainingSkips = nextSkipCount();

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the header from the first file is written. */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else if (remainingSkips > 0)
            {
                --remainingSkips;
            }
            else
            {
                outputStream.put(line);
                outputStream.put("\n");

                /* Stop as soon as the requested sample size has been written. */
                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }

                remainingSkips = nextSkipCount();
            }
        }
    }
}

/** Sample a subset of the unique values from the key fields.
 *
 * Distinct sampling is done by hashing the key and mapping the hash value into
 * buckets matching the inclusion probability. Records having a key mapping to bucket
 * zero are output.
 *
 * When instantiated with Yes.generateRandomAll every line is output, prefixed with
 * the bucket number its key maps to.
 *
 * Params:
 *   generateRandomAll = Yes to output all lines with their bucket numbers
 *                       (gen-random-inorder), No for regular distinct sampling.
 *   cmdopt            = Command line options. Reads files, seed, hasHeader, keyFields,
 *                       distinctKeyIsFullLine, inclusionProbability, printRandom,
 *                       randomValueHeader, delim, and sampleSize (zero means no limit).
 *   outputStream      = Output range receiving the header and selected lines.
 *
 * Throws: Exception if a line has fewer fields than required by the key fields.
 */
void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsv_utils.common.utils : InputFieldReordering, throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);

    static if (generateRandomAll)
    {
        import std.format : formatValue, singleSpec;
        immutable randomValueFormatSpec = singleSpec("%d");
    }

    immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys.

    uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint;

    /* The random value column is always written in gen-random-inorder mode, otherwise
     * only when --print-random is in effect.
     */
    immutable bool writeRandomValueHeader = generateRandomAll || cmdopt.printRandom;

    /* Create a mapping for the key fields. */
    auto keyFieldsReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the header from the first file is written. */
                if (!headerWritten)
                {
                    if (writeRandomValueHeader)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                    }

                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                /* Murmurhash works by successively adding individual keys, then finalizing.
                 * Adding individual keys is simpler if the full-line-as-key and individual
                 * fields as keys cases are separated.
                 */
                auto hasher = MurmurHash3!32(cmdopt.seed);

                if (cmdopt.distinctKeyIsFullLine)
                {
                    hasher.put(cast(ubyte[]) line);
                }
                else
                {
                    assert(keyFieldsReordering !is null);

                    /* Gather the key field values and assemble the key. */
                    keyFieldsReordering.initNewLine;
                    foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                    {
                        keyFieldsReordering.processNextField(fieldIndex, fieldValue);
                        if (keyFieldsReordering.allFieldsFilled) break;
                    }

                    if (!keyFieldsReordering.allFieldsFilled)
                    {
                        import std.format : format;
                        throw new Exception(
                            format("Not enough fields in line. File: %s, Line: %s",
                                   (filename == "-") ? "Standard Input" : filename, fileLineNum));
                    }

                    /* Key fields are hashed with the delimiter between them. */
                    foreach (count, key; keyFieldsReordering.outputFields.enumerate)
                    {
                        if (count > 0) hasher.put(delimArray);
                        hasher.put(cast(ubyte[]) key);
                    }
                }

                hasher.finish;

                /* The bucket the key maps to. Bucket zero is the selected bucket. */
                auto bucket = hasher.get % numBuckets;

                static if (generateRandomAll)
                {
                    outputStream.formatValue(bucket, randomValueFormatSpec);
                    outputStream.put(cmdopt.delim);
                    outputStream.put(line);
                    outputStream.put("\n");

                    if (cmdopt.sampleSize != 0)
                    {
                        ++numLinesWritten;
                        if (numLinesWritten == cmdopt.sampleSize) return;
                    }
                }
                else if (bucket == 0)
                {
                    if (cmdopt.printRandom)
                    {
                        /* Selected lines are always in bucket zero. */
                        outputStream.put('0');
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put("\n");

                    if (cmdopt.sampleSize != 0)
                    {
                        ++numLinesWritten;
                        if (numLinesWritten == cmdopt.sampleSize) return;
                    }
                }
            }
        }
    }
}

/** Invokes the appropriate reservoir sampling routine based on the command line
 * arguments.
 *
 * This routine selects the appropriate reservoir sampling function and template
 * instantiation to use based on the command line arguments.
 *
 * Reservoir sampling is used when a fixed size sample is being pulled from an input
 * stream. Weighted and unweighted sampling is supported. These routines also
 * randomize the order of the selected lines. This is consistent with line order
 * randomization of the entire input stream (handled by randomizeLinesCommand).
 *
 * For unweighted sampling, there is a performance tradeoff choice between the two
 * available implementations. See the reservoirSampling documentation for
 * information. The threshold used here was chosen based on performance tests.
*/

void reservoirSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(cmdopt.sampleSize != 0);

    /* Sample sizes at or above this value use Algorithm R, unless compatibility mode
     * is on.
     */
    immutable size_t algorithmRSampleSizeThreshold = 128 * 1024;

    /* Weighted sampling is only available in the heap-based implementation. */
    if (cmdopt.hasWeightField)
    {
        reservoirSamplingViaHeap!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    immutable bool smallSampleWithoutOverride =
        !cmdopt.preferAlgorithmR && cmdopt.sampleSize < algorithmRSampleSizeThreshold;

    if (cmdopt.compatibilityMode || smallSampleWithoutOverride)
    {
        reservoirSamplingViaHeap!(No.isWeighted)(cmdopt, outputStream);
    }
    else
    {
        reservoirSamplingAlgorithmR(cmdopt, outputStream);
    }
}

/** Reservoir sampling using a heap. Both weighted and unweighted random sampling are
 * supported.
 *
 * The algorithm used here is based on the one-pass algorithm described by Pavlos
 * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are
 * simply set to one.
 *
 * The implementation uses a heap (priority queue) large enough to hold the desired
 * number of lines. Input is read line-by-line, assigned a random value, and added to
 * the heap. The role of the heap is to identify the lines with the highest assigned
 * random values. Once the heap is full, adding a new line means dropping the line with
 * the lowest score. A "min" heap is used for this reason.
 *
 * When done reading all lines, the "min" heap is in the opposite order needed for
 * output. The desired order is obtained by removing each element one at a time from
 * the heap. The underlying data store will have the elements in correct order.
 *
 * Generating output in weighted order matters for several reasons:
 *  - For weighted sampling, it preserves the property that smaller valid subsets can be
 *    created by taking the first N lines.
*  - For unweighted sampling, it ensures that all output permutations are possible, and
 *    are not influenced by input order or the heap data structure used.
 *  - Order consistency when making repeated use of the same random seeds, but with
 *    different sample sizes.
 *
 * There are use cases where only the selection set matters. For these, some performance
 * could be gained by skipping the reordering and simply printing the backing store
 * array in-order, but making this distinction seems an unnecessary complication.
 *
 * Notes:
 * $(LIST
 *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
 *      randomization of all input lines. This was dropped in version 1.2.2 in favor
 *      of the approach used in randomizeLines. The latter has significant advantages
 *      given that all data must be read into memory.
 *    * For larger reservoir sizes better performance can be achieved by using
 *      reservoirSamplingAlgorithmR. See the documentation of that function for details.
 * )
 */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.container.array;
    import std.container.binaryheap;
    import std.format : formatValue, singleSpec;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    /* A candidate line together with the random score assigned to it. */
    struct Entry
    {
        double score;
        char[] line;
    }

    auto rng = Random(cmdopt.seed);

    /* Create the heap and its backing store.
     *
     * An std.container.array is the backing store because the heap can then be
     * reversed simply by removing all heap elements, which leaves the store in the
     * reversed order. The Phobos binaryheap does not support this for all backing
     * stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */
    Array!Entry backingStore;
    backingStore.reserve(cmdopt.sampleSize);
    auto minHeap = backingStore.heapify!("a.score > b.score")(0);  // Min binaryheap

    /* Read every line of every file, keeping the highest scoring lines. */
    bool headerDone = false;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the header of the first file is written. */
                    if (!headerDone)
                    {
                        if (cmdopt.printRandom)
                        {
                            outputStream.put(cmdopt.randomValueHeader);
                            outputStream.put(cmdopt.delim);
                        }
                        outputStream.put(line);
                        outputStream.put("\n");
                        headerDone = true;
                    }
                    continue;
                }
            }

            static if (isWeighted)
            {
                double weight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);
                double score =
                    (weight > 0.0)
                    ? uniform01(rng) ^^ (1.0 / weight)
                    : 0.0;
            }
            else
            {
                double score = uniform01(rng);
            }

            /* byLine reuses its buffer, so a line is copied only when it is kept. */
            if (minHeap.length < cmdopt.sampleSize)
            {
                minHeap.insert(Entry(score, line.dup));
            }
            else if (minHeap.front.score < score)
            {
                minHeap.replaceFront(Entry(score, line.dup));
            }
        }
    }

    /* All selected entries are in the heap, in reverse of the desired output order.
     * Removing every element from the heap leaves the backing store in output order.
     *
     * The asserts detect a backing store whose length is not synchronized with the
     * heap, which is the binaryheap issue referenced above.
     */
    immutable size_t numSelected = minHeap.length;
    assert(numSelected == backingStore.length);

    while (!minHeap.empty) minHeap.removeFront;
    assert(numSelected == backingStore.length);

    immutable scoreFormat = singleSpec("%.17g");

    foreach (entry; backingStore)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatValue(entry.score, scoreFormat);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.line);
        outputStream.put("\n");
    }
}

/** Generates weighted random values for all input lines, preserving input order.
 *
 * This complements weighted reservoir sampling. Instead of keeping a reservoir, it
 * iterates over the input lines and writes each one prefixed by its weighted random
 * value. The values are generated with the same formula used by
 * reservoirSamplingViaHeap. Output stops early once cmdopt.sampleSize lines have been
 * written, if a non-zero sample size was given.
 */
void generateWeightedRandomValuesInorder(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.hasWeightField);

    immutable scoreFormat = singleSpec("%.17g");
    auto rng = Random(cmdopt.seed);

    bool headerDone = false;
    size_t numWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the header of the first file is written. */
                    if (!headerDone)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                        outputStream.put(line);
                        outputStream.put("\n");
                        headerDone = true;
                    }
                    continue;
                }
            }

            double weight = getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                                 filename, fileLineNum);
            double score =
                (weight > 0.0)
                ? uniform01(rng) ^^ (1.0 / weight)
                : 0.0;

            outputStream.formatValue(score, scoreFormat);
            outputStream.put(cmdopt.delim);
            outputStream.put(line);
            outputStream.put("\n");

            /* A sample size of zero means no limit; the counter is then left alone. */
            if (cmdopt.sampleSize != 0 && ++numWritten == cmdopt.sampleSize) return;
        }
    }
}

/** Reservoir sampling via Algorithm R
 *
 * This is an implementation of reservoir sampling using what is commonly known as
 * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of
 * Computer Programming, Volume 2: Seminumerical Algorithms". More information about
 * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with
 * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
 *
 * Algorithm R is used for unweighted sampling without replacement. The heap-based
 * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
 *
 * The classic algorithm stops after identifying the selected set of items. This
 * implementation goes one step further and randomizes the order of the selected
 * lines.
* This supports the tsv-sample use-case, which is line order randomization.
 *
 * This algorithm is faster than reservoirSamplingViaHeap when the sample size
 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
 * Insertion in this algorithm is O(1). Similarly, generating the random order in the
 * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
 *
 * This speed advantage may be offset a certain amount by using a more expensive random
 * value generator. reservoirSamplingViaHeap generates values between zero and one,
 * whereas reservoirSamplingAlgorithmR generates random integers over an ever growing
 * interval. The latter is expected to be more expensive. This is consistent with
 * performance tests indicating that reservoirSamplingViaHeap is faster when using
 * small-to-medium size reservoirs and large input streams.
 *
 * Selection probability: once the reservoir holds k lines, the t-th data line
 * (1-based, t > k) must enter the reservoir with probability k/t for the sample to be
 * uniform. The slot is therefore drawn from the half-open range [0, t), and the line
 * is kept only if the slot lies inside the reservoir.
 *
 * Params:
 *   cmdopt       = Command line options. sampleSize must be non-zero; weighted
 *                  sampling, compatibility mode and random value printing are not
 *                  supported by this routine (enforced by asserts).
 *   outputStream = Output range receiving the header (if any) and the sampled lines.
 */
void reservoirSamplingAlgorithmR(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, randomShuffle, uniform;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.sampleSize > 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.compatibilityMode);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    string[] reservoir;
    auto reservoirAppender = appender(&reservoir);
    reservoirAppender.reserve(cmdopt.sampleSize);

    auto randomGenerator = Random(cmdopt.seed);

    /* Process each line. */

    bool headerWritten = false;
    size_t totalLineNum = 0;   // Number of data lines seen before the current one.
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the header of the first file is written. */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                /* Add lines to the reservoir until the reservoir is filled.
                 * After that lines are added with decreasing likelihood, based on
                 * the total number of lines seen. If added to the reservoir, the
                 * line replaces a randomly chosen existing line.
                 */
                if (totalLineNum < cmdopt.sampleSize)
                {
                    reservoirAppender ~= line.idup;
                }
                else
                {
                    /* The current line is data line number (totalLineNum + 1), so the
                     * slot must be drawn from [0, totalLineNum + 1). std.random.uniform
                     * excludes its upper bound by default; drawing from
                     * [0, totalLineNum) would admit the first line after the reservoir
                     * fills with probability 1 and bias every later draw.
                     *
                     * Note: this changes which lines are selected for a given seed
                     * compared to drawing from [0, totalLineNum).
                     */
                    immutable size_t slot = uniform(0, totalLineNum + 1, randomGenerator);
                    if (slot < reservoir.length) reservoir[slot] = line.idup;
                }

                ++totalLineNum;
            }
        }
    }

    /* The random sample is now in the reservoir. Shuffle it and print. */

    reservoir.randomShuffle(randomGenerator);

    foreach (ref line; reservoir)
    {
        outputStream.put(line);
        outputStream.put("\n");
    }
}

/** Invokes the appropriate routine to randomize input lines based on the command line
 * arguments.
 *
 * This routine selects the appropriate randomize lines function and template instantiation
 * to use based on the command line arguments.
*/
void randomizeLinesCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    /* Shuffling handles only the plain case: unweighted and not compatibility mode. */
    immutable bool needsSort = cmdopt.hasWeightField || cmdopt.compatibilityMode;

    if (!needsSort)
    {
        randomizeLinesViaShuffle(cmdopt, outputStream);
        return;
    }

    if (cmdopt.hasWeightField) randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
    else randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
}

/** Randomize all the lines in files or standard input by assigning each line a random
 * value and sorting on it.
 *
 * All lines in files and/or standard input are read in and written out in random
 * order. Each line is given a random value and the lines are sorted by that value in
 * descending order. This approach supports both weighted sampling and simple random
 * sampling (unweighted). The random value is written in front of each line when
 * cmdopt.printRandom is set.
 *
 * This is significantly faster than heap-based reservoir sampling in the case where
 * the entire file is being read. See also randomizeLinesViaShuffle for the unweighted
 * case, as it is a little faster, at the cost of not supporting random value printing
 * or compatibility-mode.
 *
 * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 */
void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map, sort;
    import std.format : formatValue, singleSpec;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize == 0);

    immutable scoreFormat = singleSpec("%.17g");

    /* Read all file data into memory, split it into lines, and assign a random value
     * to each line. identifyFileLines also writes the first header line.
     */
    auto fileData = cmdopt.files.map!FileData.array;
    auto scoredLines = fileData.identifyFileLines!(Yes.hasRandomValue, isWeighted)(cmdopt, outputStream);

    /* Highest random value first. */
    scoredLines.sort!((lhs, rhs) => lhs.randomValue > rhs.randomValue);

    foreach (entry; scoredLines)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatValue(entry.randomValue, scoreFormat);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.data);
        outputStream.put("\n");
    }
}

/** Randomize all the lines in files or standard input using a shuffling algorithm.
 *
 * All lines in files and/or standard input are read in and written out in random
 * order. This routine uses array shuffling, which is faster than sorting. This makes
 * this routine a good alternative to randomizeLinesViaSort when doing unweighted
 * randomization.
 *
 * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 *
 * This routine does not support random value printing or compatibility-mode.
*/
void randomizeLinesViaShuffle(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, randomShuffle;

    assert(cmdopt.sampleSize == 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    /* Read all file data into memory and split into lines. identifyFileLines is also
     * handed the output stream and writes the header line to it.
     */
    auto fileData = cmdopt.files.map!FileData.array;
    auto shuffledLines = fileData.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);

    /* Randomly shuffle, then print each line.
     *
     * Note: Also tried randomCover, but that was exceedingly slow.
     */
    auto rng = Random(cmdopt.seed);
    shuffledLines.randomShuffle(rng);

    foreach (ref entry; shuffledLines)
    {
        outputStream.put(entry.data);
        outputStream.put("\n");
    }
}

/** Simple random sampling with replacement.
 *
 * All lines in files and/or standard input are read in. Then random lines are selected
 * one at a time and output. Lines can be selected multiple times. This process continues
 * until the desired number of samples (--n|num) has been output. Output continues
 * indefinitely if a sample size was not provided. Nothing beyond the header is written
 * when the input contains no data lines.
 */
void simpleRandomSamplingWithReplacement(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, uniform;

    /* Read all file data into memory and split the data into lines. */
    auto fileData = cmdopt.files.map!FileData.array;
    auto inputLines = fileData.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);

    if (inputLines.length == 0) return;

    auto rng = Random(cmdopt.seed);

    /* A sample size of zero means "repeat forever". */
    immutable bool unbounded = (cmdopt.sampleSize == 0);
    size_t numWritten = 0;

    while (unbounded || numWritten < cmdopt.sampleSize)
    {
        immutable size_t pick = uniform(0, inputLines.length, rng);
        outputStream.put(inputLines[pick].data);
        outputStream.put("\n");
        if (!unbounded) ++numWritten;
    }
}

/** A container for, and reader of, data from a file or standard input.
 *
 * The FileData struct is used to read data from a file or standard input. It is used
 * by passing a filename to the constructor. The constructor reads the file data.
 * If the filename is a single hyphen ('-') then data is read from standard input.
 *
 * The struct makes the data available through two members: 'filename', which is the
 * filename, and 'data', which is a character array of the data.
 */
struct FileData
{
    string filename;
    char[] data;

    this(string fname)
    {
        import std.algorithm : min;
        import std.array : appender;

        filename = fname;

        immutable bool isStdin = (fname == "-");
        auto inputFile = isStdin ? stdin : fname.File;
        auto dataAppender = appender(&data);

        /* For named files, pre-size the buffer from the reported file size.
         * ulong.max is treated as "size not available".
         */
        if (!isStdin)
        {
            ulong filesize = inputFile.size;
            if (filesize < ulong.max) dataAppender.reserve(min(filesize, size_t.max));
        }

        /* Read in fixed-size chunks, appending each chunk to 'data'. */
        ubyte[1024 * 128] readBuffer;
        foreach (ref ubyte[] chunk; inputFile.byChunk(readBuffer)) dataAppender.put(cast(char[]) chunk);
    }
}

/** HasRandomValue is a boolean flag used at compile time by identifyFileLines to
 * distinguish use cases needing random value assignments from those that don't.
*/
alias HasRandomValue = Flag!"hasRandomValue";

/** Element type of the array returned by identifyFileLines. Each element represents
 * one non-header line found in a FileData array.
 *
 * 'data' holds the text of the line. A 'randomValue' member exists only in the
 * HasRandomValue.yes instantiation, i.e. when random values are being generated.
 */
struct InputLine(HasRandomValue hasRandomValue)
{
    char[] data;

    static if (hasRandomValue)
    {
        double randomValue;
    }
}

/** identifyFileLines is used by algorithms that read all files into memory prior to
 * processing. It does the initial processing of the file data.
 *
 * Three primary tasks are performed. One is splitting all input data into lines. The
 * second is writing the header line from the first file to the output stream. Header
 * lines from subsequent files are ignored. Third is assigning a random value to the
 * line, if random values are being generated.
 *
 * The key input is a FileData array, one element for each file. The FileData reads
 * the file when instantiated.
 *
 * The return value is an array of InputLine structs. The struct will have a 'randomValue'
 * member if random values are being assigned.
*/
InputLine!hasRandomValue[] identifyFileLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted, OutputRange)
    (ref FileData[] fileData, TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    /* Weighted values are random values, so weighting requires hasRandomValue. */
    static assert(hasRandomValue || !isWeighted);
    static if (!hasRandomValue) assert(!cmdopt.printRandom);

    InputLine!hasRandomValue[] inputLines;
    auto linesAppender = appender(&inputLines);

    static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed);
    bool headerWritten = false;

    foreach (fd; fileData)
    {
        /* Drop the last newline to avoid adding an extra empty line. */
        char[] content = fd.data;
        if (content.length > 0 && content[$ - 1] == '\n') content = content[0 .. $ - 1];

        foreach (fileLineNum, ref line; content.splitter('\n').enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, fd.filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Only the header of the first file is written. */
                    if (!headerWritten)
                    {
                        if (cmdopt.printRandom)
                        {
                            outputStream.put(cmdopt.randomValueHeader);
                            outputStream.put(cmdopt.delim);
                        }
                        outputStream.put(line);
                        outputStream.put("\n");
                        headerWritten = true;
                    }
                    continue;
                }
            }

            static if (!hasRandomValue)
            {
                linesAppender.put(InputLine!hasRandomValue(line));
            }
            else
            {
                static if (isWeighted)
                {
                    double lineWeight =
                        getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                             fd.filename, fileLineNum);
                    double randomValue =
                        (lineWeight > 0.0)
                        ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                        : 0.0;
                }
                else
                {
                    double randomValue = uniform01(randomGenerator);
                }

                linesAppender.put(InputLine!hasRandomValue(line, randomValue));
            }
        }
    }

    return inputLines;
}


import std.traits : isSomeChar;

/** Convenience function for extracting a single field from a line. See
 * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error
 * text tailored for this program.
 *
 * Any exception raised by getTsvFieldValue is rethrown as a plain Exception whose
 * message names the file (or "Standard Input" for "-") and the line number. For a
 * ConvException on line 1 the message also suggests that the line may be a header.
 */
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException, to;
    import std.format : format;
    import tsv_utils.common.utils : getTsvFieldValue;

    immutable string sourceName = (filename == "-") ? "Standard Input" : filename;

    try
    {
        return getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* The field was found but could not be converted to T. */
        immutable string headerHint =
            (lineNum == 1) ? "\n Is this a header line? Use --H|header to skip." : "";

        throw new Exception(
            format("Could not process line: %s\n File: %s Line: %s%s",
                   exc.msg, sourceName, lineNum, headerHint));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n File: %s Line: %s",
                   exc.msg, sourceName, lineNum));
    }
}

unittest
{
    /* getFieldValue unit tests. getTsvFieldValue has its own tests.
     * These tests make basic sanity checks on the getFieldValue wrapper.
*/
    import std.exception;

    /* Successful conversions. */
    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);

    /* Failures must throw on line 1 and on later lines alike. */
    size_t[] lineNums = [1, 2];
    foreach (lineNum; lineNums)
    {
        assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", lineNum));  // Not a number
    }
    foreach (lineNum; lineNums)
    {
        assertThrown(getFieldValue!double("123", 1, '\t', "unittest", lineNum));  // No such field
    }
}

/* Unit tests for the main program start here.
 *
 * Portability note: Many of the tests here rely on generating consistent random numbers
 * across different platforms when using the same random seed. So far this has succeeded
 * on several different platform, compiler, and library versions. However, it is certainly
 * possible this condition will not hold on other platforms.
 *
 * For tsv-sample, this portability implies generating the same results on different
 * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
 * but it is convenient for testing. If platforms are identified that do not generate
 * the same results these tests will need to be adjusted.
 */
version(unittest)
{
    /* Unit test helper functions. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /* Runs tsvSample with the given command line and asserts that the captured output
     * equals 'expected' rendered as TSV text. cmdArgs[0] is used as the test label in
     * failure messages.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Prefixes a failure message with the helper name and the test label. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testTsvSample] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        /* Snapshot the arguments for error reporting before they are handed to
         * processArgs (presumably because argument processing can alter cmdArgs).
         */
        auto savedCmdArgs = cmdArgs.to!string;

        TsvSampleOptions cmdopt;
        auto parseResult = cmdopt.processArgs(cmdArgs);
        assert(parseResult[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        auto outputBuffer = appender!(char[])();
        tsvSample(cmdopt, outputBuffer);    // This invokes the main code line.

        auto expectedOutput = expected.tsvDataToString;

        assert(outputBuffer.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, outputBuffer.data.to!string));
    }
}

unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto testDir = makeUnittestTempDir("tsv_sample");
    scope(exit) testDir.rmdirRecurse;

    /* Tabular data sets and expected results use the built-in static seed.
     * Tests are run by writing the data set to a file, then calling the main
     * routine to process. The function testTsvSample plays the role of the
     * main program. Rather than writing to expected output, the results are
     * matched against expected. The expected results were verified by hand
     * prior to inclusion in the test.
     *
     * The initial part of this section is simply setting up data files and
     * expected results.
     *
     * Expected results naming conventions:
     *  - Prefix: dataNxMExpected. N and M are numbers. e.g.
data3x6Expected 1546 * - Sampling Type (required): Permute, Replace, Bernoulli, Distinct 1547 * - Compatibility: Compat, AlgoR, Skip, Swap 1548 * - Weight Field: Wt<num>, e.g. Wt3 1549 * - Sample Size: Num<num>, eg. Num3 1550 * - Seed Value: V<num>, eg. V77 1551 * - Key Field: K<num>, e.g. K2 1552 * - Probability: P<num>, e.g P05 (5%) 1553 * - Printing Probalities: Probs 1554 * - Printing Probs in order: ProbsInorder 1555 * - Printing Probs with custom header: RVCustom 1556 */ 1557 1558 /* Empty file. */ 1559 string[][] dataEmpty = []; 1560 string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv"); 1561 writeUnittestTsvFile(fpath_dataEmpty, dataEmpty); 1562 1563 /* 3x1, header only. */ 1564 string[][] data3x0 = [["field_a", "field_b", "field_c"]]; 1565 string fpath_data3x0 = buildPath(testDir, "data3x0.tsv"); 1566 writeUnittestTsvFile(fpath_data3x0, data3x0); 1567 1568 /* 3x1 */ 1569 string[][] data3x1 = 1570 [["field_a", "field_b", "field_c"], 1571 ["tan", "タン", "8.5"]]; 1572 1573 string fpath_data3x1 = buildPath(testDir, "data3x1.tsv"); 1574 string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv"); 1575 writeUnittestTsvFile(fpath_data3x1, data3x1); 1576 writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]); 1577 1578 string[][] data3x1ExpectedReplaceNum3 = 1579 [["field_a", "field_b", "field_c"], 1580 ["tan", "タン", "8.5"], 1581 ["tan", "タン", "8.5"], 1582 ["tan", "タン", "8.5"]]; 1583 1584 /* 3x2 */ 1585 string[][] data3x2 = 1586 [["field_a", "field_b", "field_c"], 1587 ["brown", "褐色", "29.2"], 1588 ["gray", "グレー", "6.2"]]; 1589 1590 string fpath_data3x2 = buildPath(testDir, "data3x2.tsv"); 1591 string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv"); 1592 writeUnittestTsvFile(fpath_data3x2, data3x2); 1593 writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]); 1594 1595 string[][] data3x2PermuteCompat = 1596 [["field_a", "field_b", "field_c"], 1597 ["gray", "グレー", "6.2"], 1598 ["brown", "褐色", "29.2"]]; 1599 1600 
string[][] data3x2PermuteShuffle = 1601 [["field_a", "field_b", "field_c"], 1602 ["gray", "グレー", "6.2"], 1603 ["brown", "褐色", "29.2"]]; 1604 1605 /* 3x3 */ 1606 string[][] data3x3 = 1607 [["field_a", "field_b", "field_c"], 1608 ["orange", "オレンジ", "2.5"], 1609 ["pink", "ピンク", "1.1"], 1610 ["purple", "紫の", "42"]]; 1611 1612 string fpath_data3x3 = buildPath(testDir, "data3x3.tsv"); 1613 string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv"); 1614 writeUnittestTsvFile(fpath_data3x3, data3x3); 1615 writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]); 1616 1617 string[][] data3x3ExpectedPermuteCompat = 1618 [["field_a", "field_b", "field_c"], 1619 ["purple", "紫の", "42"], 1620 ["pink", "ピンク", "1.1"], 1621 ["orange", "オレンジ", "2.5"]]; 1622 1623 string[][] data3x3ExpectedPermuteSwap = 1624 [["field_a", "field_b", "field_c"], 1625 ["purple", "紫の", "42"], 1626 ["orange", "オレンジ", "2.5"], 1627 ["pink", "ピンク", "1.1"]]; 1628 1629 /* 3x6 */ 1630 string[][] data3x6 = 1631 [["field_a", "field_b", "field_c"], 1632 ["red", "赤", "23.8"], 1633 ["green", "緑", "0.0072"], 1634 ["white", "白", "1.65"], 1635 ["yellow", "黄", "12"], 1636 ["blue", "青", "12"], 1637 ["black", "黒", "0.983"]]; 1638 string fpath_data3x6 = buildPath(testDir, "data3x6.tsv"); 1639 string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv"); 1640 writeUnittestTsvFile(fpath_data3x6, data3x6); 1641 writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]); 1642 1643 // Randomization, all lines 1644 string[][] data3x6ExpectedPermuteCompat = 1645 [["field_a", "field_b", "field_c"], 1646 ["yellow", "黄", "12"], 1647 ["black", "黒", "0.983"], 1648 ["blue", "青", "12"], 1649 ["white", "白", "1.65"], 1650 ["green", "緑", "0.0072"], 1651 ["red", "赤", "23.8"]]; 1652 1653 string[][] data3x6ExpectedPermuteSwap = 1654 [["field_a", "field_b", "field_c"], 1655 ["black", "黒", "0.983"], 1656 ["green", "緑", "0.0072"], 1657 ["red", "赤", "23.8"], 1658 ["yellow", "黄", "12"], 1659 ["white", "白", "1.65"], 
1660 ["blue", "青", "12"]]; 1661 1662 string[][] data3x6ExpectedPermuteCompatProbs = 1663 [["random_value", "field_a", "field_b", "field_c"], 1664 ["0.96055546286515892", "yellow", "黄", "12"], 1665 ["0.7571015392895788", "black", "黒", "0.983"], 1666 ["0.52525980887003243", "blue", "青", "12"], 1667 ["0.49287854949943721", "white", "白", "1.65"], 1668 ["0.15929344086907804", "green", "緑", "0.0072"], 1669 ["0.010968807619065046", "red", "赤", "23.8"]]; 1670 1671 /* Note: data3x6ExpectedAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because 1672 * both are effectively the same algorithm given that --num is data length. Both read 1673 * in the full data in order then call randomShuffle. 1674 */ 1675 string[][] data3x6ExpectedPermuteAlgoRNum6 = 1676 [["field_a", "field_b", "field_c"], 1677 ["black", "黒", "0.983"], 1678 ["green", "緑", "0.0072"], 1679 ["red", "赤", "23.8"], 1680 ["yellow", "黄", "12"], 1681 ["white", "白", "1.65"], 1682 ["blue", "青", "12"]]; 1683 1684 string[][] data3x6ExpectedPermuteAlgoRNum5 = 1685 [["field_a", "field_b", "field_c"], 1686 ["red", "赤", "23.8"], 1687 ["black", "黒", "0.983"], 1688 ["white", "白", "1.65"], 1689 ["green", "緑", "0.0072"], 1690 ["yellow", "黄", "12"]]; 1691 1692 string[][] data3x6ExpectedPermuteAlgoRNum4 = 1693 [["field_a", "field_b", "field_c"], 1694 ["blue", "青", "12"], 1695 ["green", "緑", "0.0072"], 1696 ["black", "黒", "0.983"], 1697 ["white", "白", "1.65"]]; 1698 1699 string[][] data3x6ExpectedPermuteAlgoRNum3 = 1700 [["field_a", "field_b", "field_c"], 1701 ["red", "赤", "23.8"], 1702 ["black", "黒", "0.983"], 1703 ["green", "緑", "0.0072"]]; 1704 1705 string[][] data3x6ExpectedPermuteAlgoRNum2 = 1706 [["field_a", "field_b", "field_c"], 1707 ["black", "黒", "0.983"], 1708 ["red", "赤", "23.8"]]; 1709 1710 string[][] data3x6ExpectedPermuteAlgoRNum1 = 1711 [["field_a", "field_b", "field_c"], 1712 ["green", "緑", "0.0072"]]; 1713 1714 string[][] data3x6ExpectedBernoulliProbsP100 = 1715 [["random_value", "field_a", "field_b", 
"field_c"], 1716 ["0.010968807619065046", "red", "赤", "23.8"], 1717 ["0.15929344086907804", "green", "緑", "0.0072"], 1718 ["0.49287854949943721", "white", "白", "1.65"], 1719 ["0.96055546286515892", "yellow", "黄", "12"], 1720 ["0.52525980887003243", "blue", "青", "12"], 1721 ["0.7571015392895788", "black", "黒", "0.983"]]; 1722 1723 string[][] data3x6ExpectedBernoulliCompatProbsP60 = 1724 [["random_value", "field_a", "field_b", "field_c"], 1725 ["0.010968807619065046", "red", "赤", "23.8"], 1726 ["0.15929344086907804", "green", "緑", "0.0072"], 1727 ["0.49287854949943721", "white", "白", "1.65"], 1728 ["0.52525980887003243", "blue", "青", "12"]]; 1729 1730 string[][] data3x6ExpectedBernoulliSkipP40 = 1731 [["field_a", "field_b", "field_c"], 1732 ["red", "赤", "23.8"], 1733 ["green", "緑", "0.0072"], 1734 ["yellow", "黄", "12"]]; 1735 1736 string[][] data3x6ExpectedBernoulliCompatP60 = 1737 [["field_a", "field_b", "field_c"], 1738 ["red", "赤", "23.8"], 1739 ["green", "緑", "0.0072"], 1740 ["white", "白", "1.65"], 1741 ["blue", "青", "12"]]; 1742 1743 string[][] data3x6ExpectedDistinctK1K3P60 = 1744 [["field_a", "field_b", "field_c"], 1745 ["green", "緑", "0.0072"], 1746 ["white", "白", "1.65"], 1747 ["blue", "青", "12"]]; 1748 1749 string[][] data3x6ExpectedDistinctK1K3P60Probs = 1750 [["random_value", "field_a", "field_b", "field_c"], 1751 ["0", "green", "緑", "0.0072"], 1752 ["0", "white", "白", "1.65"], 1753 ["0", "blue", "青", "12"]]; 1754 1755 string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom = 1756 [["custom_random_value_header", "field_a", "field_b", "field_c"], 1757 ["0", "green", "緑", "0.0072"], 1758 ["0", "white", "白", "1.65"], 1759 ["0", "blue", "青", "12"]]; 1760 1761 string[][] data3x6ExpectedDistinctK2P2ProbsInorder = 1762 [["random_value", "field_a", "field_b", "field_c"], 1763 ["1", "red", "赤", "23.8"], 1764 ["0", "green", "緑", "0.0072"], 1765 ["0", "white", "白", "1.65"], 1766 ["1", "yellow", "黄", "12"], 1767 ["3", "blue", "青", "12"], 1768 ["2", "black", "黒", 
"0.983"]]; 1769 1770 string[][] data3x6ExpectedPermuteWt3Probs = 1771 [["random_value", "field_a", "field_b", "field_c"], 1772 ["0.9966519875764539", "yellow", "黄", "12"], 1773 ["0.94775884809836686", "blue", "青", "12"], 1774 ["0.82728234682286661", "red", "赤", "23.8"], 1775 ["0.75346697377181959", "black", "黒", "0.983"], 1776 ["0.65130103496422487", "white", "白", "1.65"], 1777 ["1.5636943712879866e-111", "green", "緑", "0.0072"]]; 1778 1779 string[][] data3x6ExpectedWt3ProbsInorder = 1780 [["random_value", "field_a", "field_b", "field_c"], 1781 ["0.82728234682286661", "red", "赤", "23.8"], 1782 ["1.5636943712879866e-111", "green", "緑", "0.0072"], 1783 ["0.65130103496422487", "white", "白", "1.65"], 1784 ["0.9966519875764539", "yellow", "黄", "12"], 1785 ["0.94775884809836686", "blue", "青", "12"], 1786 ["0.75346697377181959", "black", "黒", "0.983"]]; 1787 1788 string[][] data3x6ExpectedPermuteWt3 = 1789 [["field_a", "field_b", "field_c"], 1790 ["yellow", "黄", "12"], 1791 ["blue", "青", "12"], 1792 ["red", "赤", "23.8"], 1793 ["black", "黒", "0.983"], 1794 ["white", "白", "1.65"], 1795 ["green", "緑", "0.0072"]]; 1796 1797 string[][] data3x6ExpectedReplaceNum10 = 1798 [["field_a", "field_b", "field_c"], 1799 ["black", "黒", "0.983"], 1800 ["green", "緑", "0.0072"], 1801 ["green", "緑", "0.0072"], 1802 ["red", "赤", "23.8"], 1803 ["yellow", "黄", "12"], 1804 ["red", "赤", "23.8"], 1805 ["white", "白", "1.65"], 1806 ["yellow", "黄", "12"], 1807 ["yellow", "黄", "12"], 1808 ["white", "白", "1.65"], 1809 ]; 1810 1811 string[][] data3x6ExpectedReplaceNum10V77 = 1812 [["field_a", "field_b", "field_c"], 1813 ["black", "黒", "0.983"], 1814 ["red", "赤", "23.8"], 1815 ["black", "黒", "0.983"], 1816 ["yellow", "黄", "12"], 1817 ["green", "緑", "0.0072"], 1818 ["green", "緑", "0.0072"], 1819 ["green", "緑", "0.0072"], 1820 ["yellow", "黄", "12"], 1821 ["blue", "青", "12"], 1822 ["white", "白", "1.65"], 1823 ]; 1824 1825 /* Using a different static seed. 
*/ 1826 string[][] data3x6ExpectedPermuteCompatV41Probs = 1827 [["random_value", "field_a", "field_b", "field_c"], 1828 ["0.68057272653095424", "green", "緑", "0.0072"], 1829 ["0.67681624367833138", "blue", "青", "12"], 1830 ["0.32097338931635022", "yellow", "黄", "12"], 1831 ["0.25092361867427826", "red", "赤", "23.8"], 1832 ["0.15535934292711318", "black", "黒", "0.983"], 1833 ["0.04609582107514143", "white", "白", "1.65"]]; 1834 1835 string[][] data3x6ExpectedBernoulliCompatP60V41Probs = 1836 [["random_value", "field_a", "field_b", "field_c"], 1837 ["0.25092361867427826", "red", "赤", "23.8"], 1838 ["0.04609582107514143", "white", "白", "1.65"], 1839 ["0.32097338931635022", "yellow", "黄", "12"], 1840 ["0.15535934292711318", "black", "黒", "0.983"]]; 1841 1842 string[][] data3x6ExpectedPermuteWt3V41Probs = 1843 [["random_value", "field_a", "field_b", "field_c"], 1844 ["0.96799377498910666", "blue", "青", "12"], 1845 ["0.94356245792573568", "red", "赤", "23.8"], 1846 ["0.90964601024271996", "yellow", "黄", "12"], 1847 ["0.15491658409260103", "white", "白", "1.65"], 1848 ["0.15043620392537033", "black", "黒", "0.983"], 1849 ["6.1394674830701461e-24", "green", "緑", "0.0072"]]; 1850 1851 string[][] data3x6ExpectedWt3V41ProbsInorder = 1852 [["random_value", "field_a", "field_b", "field_c"], 1853 ["0.94356245792573568", "red", "赤", "23.8"], 1854 ["6.1394674830701461e-24", "green", "緑", "0.0072"], 1855 ["0.15491658409260103", "white", "白", "1.65"], 1856 ["0.90964601024271996", "yellow", "黄", "12"], 1857 ["0.96799377498910666", "blue", "青", "12"], 1858 ["0.15043620392537033", "black", "黒", "0.983"]]; 1859 1860 1861 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 1862 string[][] combo1ExpectedPermuteCompat = 1863 [["field_a", "field_b", "field_c"], 1864 ["yellow", "黄", "12"], 1865 ["tan", "タン", "8.5"], 1866 ["brown", "褐色", "29.2"], 1867 ["green", "緑", "0.0072"], 1868 ["red", "赤", "23.8"], 1869 ["purple", "紫の", "42"], 1870 ["black", "黒", "0.983"], 1871 ["white", "白", "1.65"], 1872 ["gray", "グレー", "6.2"], 1873 ["blue", "青", "12"], 1874 ["pink", "ピンク", "1.1"], 1875 ["orange", "オレンジ", "2.5"]]; 1876 1877 string[][] combo1ExpectedPermuteCompatProbs = 1878 [["random_value", "field_a", "field_b", "field_c"], 1879 ["0.97088520275428891", "yellow", "黄", "12"], 1880 ["0.96055546286515892", "tan", "タン", "8.5"], 1881 ["0.81756894313730299", "brown", "褐色", "29.2"], 1882 ["0.7571015392895788", "green", "緑", "0.0072"], 1883 ["0.52525980887003243", "red", "赤", "23.8"], 1884 ["0.49287854949943721", "purple", "紫の", "42"], 1885 ["0.47081507067196071", "black", "黒", "0.983"], 1886 ["0.38388182921335101", "white", "白", "1.65"], 1887 ["0.29215990612283349", "gray", "グレー", "6.2"], 1888 ["0.24033216014504433", "blue", "青", "12"], 1889 ["0.15929344086907804", "pink", "ピンク", "1.1"], 1890 ["0.010968807619065046", "orange", "オレンジ", "2.5"]]; 1891 1892 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 1893 string[][] combo1ExpectedProbsInorder = 1894 [["random_value", "field_a", "field_b", "field_c"], 1895 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 1896 ["0.15929344086907804", "pink", "ピンク", "1.1"], 1897 ["0.49287854949943721", "purple", "紫の", "42"], 1898 ["0.96055546286515892", "tan", "タン", "8.5"], 1899 ["0.52525980887003243", "red", "赤", "23.8"], 1900 ["0.7571015392895788", "green", "緑", "0.0072"], 1901 ["0.38388182921335101", "white", "白", "1.65"], 1902 ["0.97088520275428891", "yellow", "黄", "12"], 1903 ["0.24033216014504433", "blue", "青", "12"], 1904 ["0.47081507067196071", "black", "黒", "0.983"], 1905 ["0.81756894313730299", "brown", "褐色", "29.2"], 1906 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 1907 1908 string[][] combo1ExpectedBernoulliCompatP50Probs = 1909 [["random_value", "field_a", "field_b", "field_c"], 1910 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 1911 ["0.15929344086907804", "pink", "ピンク", "1.1"], 1912 ["0.49287854949943721", "purple", "紫の", "42"], 1913 ["0.38388182921335101", "white", "白", "1.65"], 1914 ["0.24033216014504433", "blue", "青", "12"], 1915 ["0.47081507067196071", "black", "黒", "0.983"], 1916 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 1917 1918 string[][] combo1ExpectedBernoulliCompatP40 = 1919 [["field_a", "field_b", "field_c"], 1920 ["orange", "オレンジ", "2.5"], 1921 ["pink", "ピンク", "1.1"], 1922 ["white", "白", "1.65"], 1923 ["blue", "青", "12"], 1924 ["gray", "グレー", "6.2"]]; 1925 1926 string[][] combo1ExpectedDistinctK1P40 = 1927 [["field_a", "field_b", "field_c"], 1928 ["orange", "オレンジ", "2.5"], 1929 ["red", "赤", "23.8"], 1930 ["green", "緑", "0.0072"], 1931 ["blue", "青", "12"], 1932 ["black", "黒", "0.983"]]; 1933 1934 string[][] combo1ExpectedPermuteWt3Probs = 1935 [["random_value", "field_a", "field_b", "field_c"], 1936 ["0.99754077523718754", "yellow", "黄", "12"], 1937 ["0.99527665440088786", "tan", "タン", "8.5"], 1938 ["0.99312578945741659", "brown", "褐色", "29.2"], 1939 ["0.98329602553389361", "purple", 
"紫の", "42"], 1940 ["0.9733096193808366", "red", "赤", "23.8"], 1941 ["0.88797551521739648", "blue", "青", "12"], 1942 ["0.81999230489041786", "gray", "グレー", "6.2"], 1943 ["0.55975569204250941", "white", "白", "1.65"], 1944 ["0.46472135609205739", "black", "黒", "0.983"], 1945 ["0.18824582704191337", "pink", "ピンク", "1.1"], 1946 ["0.1644613185329992", "orange", "オレンジ", "2.5"], 1947 ["1.6438086931020549e-17", "green", "緑", "0.0072"]]; 1948 1949 string[][] combo1ExpectedPermuteWt3 = 1950 [["field_a", "field_b", "field_c"], 1951 ["yellow", "黄", "12"], 1952 ["tan", "タン", "8.5"], 1953 ["brown", "褐色", "29.2"], 1954 ["purple", "紫の", "42"], 1955 ["red", "赤", "23.8"], 1956 ["blue", "青", "12"], 1957 ["gray", "グレー", "6.2"], 1958 ["white", "白", "1.65"], 1959 ["black", "黒", "0.983"], 1960 ["pink", "ピンク", "1.1"], 1961 ["orange", "オレンジ", "2.5"], 1962 ["green", "緑", "0.0072"]]; 1963 1964 string[][] combo1ExpectedPermuteAlgoRNum4 = 1965 [["field_a", "field_b", "field_c"], 1966 ["blue", "青", "12"], 1967 ["gray", "グレー", "6.2"], 1968 ["brown", "褐色", "29.2"], 1969 ["white", "白", "1.65"]]; 1970 1971 string[][] combo1ExpectedReplaceNum10 = 1972 [["field_a", "field_b", "field_c"], 1973 ["gray", "グレー", "6.2"], 1974 ["yellow", "黄", "12"], 1975 ["yellow", "黄", "12"], 1976 ["white", "白", "1.65"], 1977 ["tan", "タン", "8.5"], 1978 ["white", "白", "1.65"], 1979 ["blue", "青", "12"], 1980 ["black", "黒", "0.983"], 1981 ["tan", "タン", "8.5"], 1982 ["purple", "紫の", "42"]]; 1983 1984 /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. 
*/ 1985 string[][] data1x200 = 1986 [["field_a"], 1987 ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"], 1988 ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"], 1989 ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"], 1990 ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"], 1991 ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"], 1992 ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"], 1993 ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"], 1994 ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"], 1995 ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"], 1996 ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"], 1997 ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"], 1998 ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"], 1999 ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"], 2000 ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"], 2001 ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"], 2002 ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"], 2003 ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"], 2004 ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"], 2005 ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"], 2006 ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"], 2007 ]; 2008 2009 string fpath_data1x200 = 
buildPath(testDir, "data1x200.tsv"); 2010 string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv"); 2011 writeUnittestTsvFile(fpath_data1x200, data1x200); 2012 writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1..$]); 2013 2014 string[][] data1x200ExpectedBernoulliSkipV333P01 = 2015 [["field_a"], 2016 ["077"], 2017 ["119"]]; 2018 2019 string[][] data1x200ExpectedBernoulliSkipV333P02 = 2020 [["field_a"], 2021 ["038"], 2022 ["059"], 2023 ["124"], 2024 ["161"], 2025 ["162"], 2026 ["183"]]; 2027 2028 string[][] data1x200ExpectedBernoulliSkipV333P03 = 2029 [["field_a"], 2030 ["025"], 2031 ["039"], 2032 ["082"], 2033 ["107"], 2034 ["108"], 2035 ["122"], 2036 ["136"], 2037 ["166"], 2038 ["182"]]; 2039 2040 string[][] data1x200ExpectedBernoulliCompatV333P01 = 2041 [["field_a"], 2042 ["072"]]; 2043 2044 string[][] data1x200ExpectedBernoulliCompatV333P02 = 2045 [["field_a"], 2046 ["004"], 2047 ["072"]]; 2048 2049 string[][] data1x200ExpectedBernoulliCompatV333P03 = 2050 [["field_a"], 2051 ["004"], 2052 ["072"], 2053 ["181"]]; 2054 2055 /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files, 2056 * only expected results. The header is from 3x0, the results are offset 1-position 2057 * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line. 2058 */ 2059 string[][] combo2ExpectedBernoulliSkipV333P03 = 2060 [["field_a", "field_b", "field_c"], 2061 ["024"], 2062 ["038"], 2063 ["081"], 2064 ["106"], 2065 ["107"], 2066 ["121"], 2067 ["135"], 2068 ["165"], 2069 ["181"]]; 2070 2071 2072 /* 1x10 - Simple 1-column file. 
*/ 2073 string[][] data1x10 = 2074 [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]]; 2075 string fpath_data1x10 = buildPath(testDir, "data1x10.tsv"); 2076 string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv"); 2077 writeUnittestTsvFile(fpath_data1x10, data1x10); 2078 writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]); 2079 2080 string[][] data1x10ExpectedPermuteCompat = 2081 [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]]; 2082 2083 string[][] data1x10ExpectedPermuteWt1 = 2084 [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]]; 2085 2086 /* 2x10a - Uniform distribution [0,1]. */ 2087 string[][] data2x10a = 2088 [["line", "weight"], 2089 ["1", "0.26788837"], 2090 ["2", "0.06601298"], 2091 ["3", "0.38627527"], 2092 ["4", "0.47379424"], 2093 ["5", "0.02966641"], 2094 ["6", "0.05636231"], 2095 ["7", "0.70529242"], 2096 ["8", "0.91836862"], 2097 ["9", "0.99103720"], 2098 ["10", "0.31401740"]]; 2099 2100 string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv"); 2101 writeUnittestTsvFile(fpath_data2x10a, data2x10a); 2102 2103 string[][] data2x10aExpectedPermuteWt2Probs = 2104 [["random_value", "line", "weight"], 2105 ["0.96833865494543658", "8", "0.91836862"], 2106 ["0.91856842054413923", "4", "0.47379424"], 2107 ["0.25730832087795091", "7", "0.70529242"], 2108 ["0.2372531790701812", "9", "0.99103720"], 2109 ["0.16016096701872204", "3", "0.38627527"], 2110 ["0.090819662667243381", "10", "0.31401740"], 2111 ["0.0071764539244361172", "6", "0.05636231"], 2112 ["4.8318642951630057e-08", "1", "0.26788837"], 2113 ["3.7525692966535517e-10", "5", "0.02966641"], 2114 ["8.2123247880095796e-13", "2", "0.06601298"]]; 2115 2116 /* 2x10b - Uniform distribution [0,1000]. 
*/ 2117 string[][] data2x10b = 2118 [["line", "weight"], 2119 ["1", "761"], 2120 ["2", "432"], 2121 ["3", "103"], 2122 ["4", "448"], 2123 ["5", "750"], 2124 ["6", "711"], 2125 ["7", "867"], 2126 ["8", "841"], 2127 ["9", "963"], 2128 ["10", "784"]]; 2129 2130 string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv"); 2131 writeUnittestTsvFile(fpath_data2x10b, data2x10b); 2132 2133 string[][] data2x10bExpectedPermuteWt2Probs = 2134 [["random_value", "line", "weight"], 2135 ["0.99996486739067969", "8", "841"], 2136 ["0.99991017467137211", "4", "448"], 2137 ["0.99960871524873662", "6", "711"], 2138 ["0.999141885371438", "5", "750"], 2139 ["0.99903963250274785", "10", "784"], 2140 ["0.99889631825931946", "7", "867"], 2141 ["0.99852058315191139", "9", "963"], 2142 ["0.99575669679158918", "2", "432"], 2143 ["0.99408758732050595", "1", "761"], 2144 ["0.99315467761212362", "3", "103"]]; 2145 2146 /* 2x10c - Logarithmic distribution in random order. */ 2147 string[][] data2x10c = 2148 [["line", "weight"], 2149 ["1", "31.85"], 2150 ["2", "17403.31"], 2151 ["3", "653.84"], 2152 ["4", "8.23"], 2153 ["5", "2671.04"], 2154 ["6", "26226.08"], 2155 ["7", "1.79"], 2156 ["8", "354.56"], 2157 ["9", "35213.81"], 2158 ["10", "679.29"]]; 2159 2160 string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv"); 2161 writeUnittestTsvFile(fpath_data2x10c, data2x10c); 2162 2163 string[][] data2x10cExpectedPermuteWt2Probs = 2164 [["random_value", "line", "weight"], 2165 ["0.99998939008709697", "6", "26226.08"], 2166 ["0.99995951291695517", "9", "35213.81"], 2167 ["0.99991666907613541", "8", "354.56"], 2168 ["0.9998944505218641", "2", "17403.31"], 2169 ["0.9997589760286163", "5", "2671.04"], 2170 ["0.99891852769877643", "3", "653.84"], 2171 ["0.99889167752782515", "10", "679.29"], 2172 ["0.99512207506850148", "4", "8.23"], 2173 ["0.86789371584259023", "1", "31.85"], 2174 ["0.5857443816291561", "7", "1.79"]]; 2175 2176 /* 2x10d. Logarithmic distribution in ascending order. 
*/ 2177 string[][] data2x10d = 2178 [["line", "weight"], 2179 ["1", "1.79"], 2180 ["2", "8.23"], 2181 ["3", "31.85"], 2182 ["4", "354.56"], 2183 ["5", "653.84"], 2184 ["6", "679.29"], 2185 ["7", "2671.04"], 2186 ["8", "17403.31"], 2187 ["9", "26226.08"], 2188 ["10", "35213.81"]]; 2189 2190 string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv"); 2191 writeUnittestTsvFile(fpath_data2x10d, data2x10d); 2192 2193 string[][] data2x10dExpectedPermuteWt2Probs = 2194 [["random_value", "line", "weight"], 2195 ["0.99999830221846353", "8", "17403.31"], 2196 ["0.99997860834041397", "10", "35213.81"], 2197 ["0.99994563828986716", "9", "26226.08"], 2198 ["0.99988650363575737", "4", "354.56"], 2199 ["0.99964161939190088", "7", "2671.04"], 2200 ["0.99959045338948649", "6", "679.29"], 2201 ["0.99901574490639788", "5", "653.84"], 2202 ["0.97803163304747431", "3", "31.85"], 2203 ["0.79994791806910948", "2", "8.23"], 2204 ["0.080374261239949119", "1", "1.79"]]; 2205 2206 /* 2x10e. Logarithmic distribution in descending order. 
*/ 2207 string[][] data2x10e = 2208 [["line", "weight"], 2209 ["1", "35213.81"], 2210 ["2", "26226.08"], 2211 ["3", "17403.31"], 2212 ["4", "2671.04"], 2213 ["5", "679.29"], 2214 ["6", "653.84"], 2215 ["7", "354.56"], 2216 ["8", "31.85"], 2217 ["9", "8.23"], 2218 ["10", "1.79"]]; 2219 string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv"); 2220 writeUnittestTsvFile(fpath_data2x10e, data2x10e); 2221 2222 string[][] data2x10eExpectedPermuteWt2Probs = 2223 [["random_value", "line", "weight"], 2224 ["0.99998493348975237", "4", "2671.04"], 2225 ["0.99995934807202624", "3", "17403.31"], 2226 ["0.99992995739727453", "2", "26226.08"], 2227 ["0.99987185679245649", "1", "35213.81"], 2228 ["0.99957451563173938", "6", "653.84"], 2229 ["0.99907273650209583", "8", "31.85"], 2230 ["0.99905260312968946", "5", "679.29"], 2231 ["0.99730333650516401", "7", "354.56"], 2232 ["0.84093902435227808", "9", "8.23"], 2233 ["0.65650015926290028", "10", "1.79"]]; 2234 2235 /* Data sets for distinct sampling. */ 2236 string[][] data5x25 = 2237 [["ID", "Shape", "Color", "Size", "Weight"], 2238 ["01", "circle", "red", "S", "10"], 2239 ["02", "circle", "black", "L", "20"], 2240 ["03", "square", "black", "L", "20"], 2241 ["04", "circle", "green", "L", "30"], 2242 ["05", "ellipse", "red", "S", "20"], 2243 ["06", "triangle", "red", "S", "10"], 2244 ["07", "triangle", "red", "L", "20"], 2245 ["08", "square", "black", "S", "10"], 2246 ["09", "circle", "black", "S", "20"], 2247 ["10", "square", "green", "L", "20"], 2248 ["11", "triangle", "red", "L", "20"], 2249 ["12", "circle", "green", "L", "30"], 2250 ["13", "ellipse", "red", "S", "20"], 2251 ["14", "circle", "green", "L", "30"], 2252 ["15", "ellipse", "red", "L", "30"], 2253 ["16", "square", "red", "S", "10"], 2254 ["17", "circle", "black", "L", "20"], 2255 ["18", "square", "red", "S", "20"], 2256 ["19", "square", "black", "L", "20"], 2257 ["20", "circle", "red", "S", "10"], 2258 ["21", "ellipse", "black", "L", "30"], 2259 ["22", "triangle", 
"red", "L", "30"], 2260 ["23", "circle", "green", "S", "20"], 2261 ["24", "square", "green", "L", "20"], 2262 ["25", "circle", "red", "S", "10"], 2263 ]; 2264 2265 string fpath_data5x25 = buildPath(testDir, "data5x25.tsv"); 2266 string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv"); 2267 writeUnittestTsvFile(fpath_data5x25, data5x25); 2268 writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]); 2269 2270 string[][] data5x25ExpectedDistinctK2P40 = 2271 [["ID", "Shape", "Color", "Size", "Weight"], 2272 ["03", "square", "black", "L", "20"], 2273 ["05", "ellipse", "red", "S", "20"], 2274 ["08", "square", "black", "S", "10"], 2275 ["10", "square", "green", "L", "20"], 2276 ["13", "ellipse", "red", "S", "20"], 2277 ["15", "ellipse", "red", "L", "30"], 2278 ["16", "square", "red", "S", "10"], 2279 ["18", "square", "red", "S", "20"], 2280 ["19", "square", "black", "L", "20"], 2281 ["21", "ellipse", "black", "L", "30"], 2282 ["24", "square", "green", "L", "20"], 2283 ]; 2284 2285 string[][] data5x25ExpectedDistinctK2K4P20 = 2286 [["ID", "Shape", "Color", "Size", "Weight"], 2287 ["03", "square", "black", "L", "20"], 2288 ["07", "triangle", "red", "L", "20"], 2289 ["08", "square", "black", "S", "10"], 2290 ["10", "square", "green", "L", "20"], 2291 ["11", "triangle", "red", "L", "20"], 2292 ["16", "square", "red", "S", "10"], 2293 ["18", "square", "red", "S", "20"], 2294 ["19", "square", "black", "L", "20"], 2295 ["22", "triangle", "red", "L", "30"], 2296 ["24", "square", "green", "L", "20"], 2297 ]; 2298 2299 string[][] data5x25ExpectedDistinctK2K3K4P20 = 2300 [["ID", "Shape", "Color", "Size", "Weight"], 2301 ["04", "circle", "green", "L", "30"], 2302 ["07", "triangle", "red", "L", "20"], 2303 ["09", "circle", "black", "S", "20"], 2304 ["11", "triangle", "red", "L", "20"], 2305 ["12", "circle", "green", "L", "30"], 2306 ["14", "circle", "green", "L", "30"], 2307 ["16", "square", "red", "S", "10"], 2308 ["18", "square", "red", "S", "20"], 2309 
["22", "triangle", "red", "L", "30"], 2310 ]; 2311 2312 /* Fields 2 and 4 from data5x25. Distinct rows should be the same for equiv keys. */ 2313 string[][] data2x25 = 2314 [["Shape", "Size"], 2315 ["circle", "S"], 2316 ["circle", "L"], 2317 ["square", "L"], 2318 ["circle", "L"], 2319 ["ellipse", "S"], 2320 ["triangle", "S"], 2321 ["triangle", "L"], 2322 ["square", "S"], 2323 ["circle", "S"], 2324 ["square", "L"], 2325 ["triangle", "L"], 2326 ["circle", "L"], 2327 ["ellipse", "S"], 2328 ["circle", "L"], 2329 ["ellipse", "L"], 2330 ["square", "S"], 2331 ["circle", "L"], 2332 ["square", "S"], 2333 ["square", "L"], 2334 ["circle", "S"], 2335 ["ellipse", "L"], 2336 ["triangle", "L"], 2337 ["circle", "S"], 2338 ["square", "L"], 2339 ["circle", "S"], 2340 ]; 2341 2342 string fpath_data2x25 = buildPath(testDir, "data2x25.tsv"); 2343 string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv"); 2344 writeUnittestTsvFile(fpath_data2x25, data2x25); 2345 writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1..$]); 2346 2347 string[][] data2x25ExpectedDistinctK1K2P20 = 2348 [["Shape", "Size"], 2349 ["square", "L"], 2350 ["triangle", "L"], 2351 ["square", "S"], 2352 ["square", "L"], 2353 ["triangle", "L"], 2354 ["square", "S"], 2355 ["square", "S"], 2356 ["square", "L"], 2357 ["triangle", "L"], 2358 ["square", "L"], 2359 ]; 2360 2361 string[][] data1x25 = 2362 [["Shape-Size"], 2363 ["circle-S"], 2364 ["circle-L"], 2365 ["square-L"], 2366 ["circle-L"], 2367 ["ellipse-S"], 2368 ["triangle-S"], 2369 ["triangle-L"], 2370 ["square-S"], 2371 ["circle-S"], 2372 ["square-L"], 2373 ["triangle-L"], 2374 ["circle-L"], 2375 ["ellipse-S"], 2376 ["circle-L"], 2377 ["ellipse-L"], 2378 ["square-S"], 2379 ["circle-L"], 2380 ["square-S"], 2381 ["square-L"], 2382 ["circle-S"], 2383 ["ellipse-L"], 2384 ["triangle-L"], 2385 ["circle-S"], 2386 ["square-L"], 2387 ["circle-S"], 2388 ]; 2389 2390 string fpath_data1x25 = buildPath(testDir, "data1x25.tsv"); 2391 string 
fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv"); 2392 writeUnittestTsvFile(fpath_data1x25, data1x25); 2393 writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1..$]); 2394 2395 string[][] data1x25ExpectedDistinctK1P20 = 2396 [["Shape-Size"], 2397 ["triangle-L"], 2398 ["square-S"], 2399 ["triangle-L"], 2400 ["ellipse-L"], 2401 ["square-S"], 2402 ["square-S"], 2403 ["ellipse-L"], 2404 ["triangle-L"], 2405 ]; 2406 2407 string[][] data1x25ExpectedDistinctK1P20Probs = 2408 [["random_value", "Shape-Size"], 2409 ["0", "triangle-L"], 2410 ["0", "square-S"], 2411 ["0", "triangle-L"], 2412 ["0", "ellipse-L"], 2413 ["0", "square-S"], 2414 ["0", "square-S"], 2415 ["0", "ellipse-L"], 2416 ["0", "triangle-L"], 2417 ]; 2418 2419 string[][] data1x25ExpectedDistinctK1P20ProbsInorder = 2420 [["random_value", "Shape-Size"], 2421 ["1", "circle-S"], 2422 ["4", "circle-L"], 2423 ["2", "square-L"], 2424 ["4", "circle-L"], 2425 ["2", "ellipse-S"], 2426 ["1", "triangle-S"], 2427 ["0", "triangle-L"], 2428 ["0", "square-S"], 2429 ["1", "circle-S"], 2430 ["2", "square-L"], 2431 ["0", "triangle-L"], 2432 ["4", "circle-L"], 2433 ["2", "ellipse-S"], 2434 ["4", "circle-L"], 2435 ["0", "ellipse-L"], 2436 ["0", "square-S"], 2437 ["4", "circle-L"], 2438 ["0", "square-S"], 2439 ["2", "square-L"], 2440 ["1", "circle-S"], 2441 ["0", "ellipse-L"], 2442 ["0", "triangle-L"], 2443 ["1", "circle-S"], 2444 ["2", "square-L"], 2445 ["1", "circle-S"], 2446 ]; 2447 2448 /* 2449 * Enough setup! Actually run some tests! 2450 */ 2451 2452 /* Permutations. Headers, static seed, compatibility mode. With weights and without. 
*/ 2453 testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty); 2454 testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0); 2455 testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1); 2456 testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat); 2457 testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat); 2458 testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat); 2459 testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 2460 testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 2461 testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 2462 testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2463 testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2464 testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 2465 testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); 2466 2467 /* Permutations, without compatibility mode, or with both compatibility and printing. 
*/ 2468 testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty); 2469 testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0); 2470 testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1); 2471 testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle); 2472 testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap); 2473 testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap); 2474 testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 2475 testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 2476 testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2477 2478 /* Reservoir sampling using Algorithm R. 2479 * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.) 
2480 */ 2481 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 2482 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 2483 testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0); 2484 testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0); 2485 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1); 2486 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1); 2487 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6); 2488 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6); 2489 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum5); 2490 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum4); 2491 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum3); 2492 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum2); 2493 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum1); 2494 2495 /* Bernoulli sampling cases. 
*/ 2496 testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty); 2497 testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0); 2498 testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1); 2499 testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6); 2500 testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6); 2501 testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2502 testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60); 2503 testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60); 2504 testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs); 2505 2506 /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. 
*/ 2507 testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01); 2508 testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02); 2509 testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03); 2510 testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01); 2511 testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02); 2512 testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03); 2513 testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40); 2514 2515 /* Distinct sampling cases. */ 2516 testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty); 2517 testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0); 2518 testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1); 2519 testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6); 2520 testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 2521 2522 2523 2524 /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling. 2525 * For weighted sampling, use the weighted cases, but with the expected results in the original input order.
2526 */ 2527 testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2528 testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2529 testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 2530 data3x6ExpectedWt3ProbsInorder); 2531 testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 2532 data3x6ExpectedWt3V41ProbsInorder); 2533 testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6], 2534 data3x6ExpectedDistinctK1K3P60Probs); 2535 testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header", 2536 "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom); 2537 testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6], 2538 data3x6ExpectedDistinctK2P2ProbsInorder); 2539 2540 /* Simple random sampling with replacement. */ 2541 testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty); 2542 testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty); 2543 testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0); 2544 testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0); 2545 testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3); 2546 testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10); 2547 testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77); 2548 2549 /* Permutations, compatibility mode, without headers. 
*/ 2550 testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1..$]); 2551 testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1..$]); 2552 testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1..$]); 2553 testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..$]); 2554 testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..$]); 2555 testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]); 2556 testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]); 2557 testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]); 2558 testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1..$]); 2559 2560 /* Permutations, no headers, without compatibility mode, or with printing and compatibility mode. 
*/ 2561 testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1..$]); 2562 testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1..$]); 2563 testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1..$]); 2564 testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1..$]); 2565 testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]); 2566 testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]); 2567 testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]); 2568 2569 /* Reservoir sampling using Algorithm R, no headers. */ 2570 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 2571 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 2572 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1..$]); 2573 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1..$]); 2574 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]); 2575 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]); 2576 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum5[1..$]); 2577 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum4[1..$]); 2578 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-s", "--num", "3", 
fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum3[1..$]); 2579 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum2[1..$]); 2580 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum1[1..$]); 2581 2582 /* Bernoulli sampling cases. */ 2583 testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1..$]); 2584 testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]); 2585 testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]); 2586 testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]); 2587 testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..$]); 2588 testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1..$]); 2589 2590 /* Bernoulli sampling with probabilities in skip sampling range. 
*/ 2591 testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1..$]); 2592 testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1..$]); 2593 testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..$]); 2594 testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1..$]); 2595 testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1..$]); 2596 testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1..$]); 2597 testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1..$]); 2598 2599 /* Distinct sampling cases. */ 2600 testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]); 2601 testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2602 testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2603 testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2604 2605 /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. 
*/ 2606 testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]); 2607 testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1..$]); 2608 testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader], 2609 data3x6ExpectedDistinctK1K3P60Probs[1..$]); 2610 testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader], 2611 data3x6ExpectedDistinctK2P2ProbsInorder[1..$]); 2612 2613 /* Simple random sampling with replacement. */ 2614 testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty); 2615 testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty); 2616 testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1..$]); 2617 testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1..$]); 2618 testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1..$]); 2619 2620 /* Multi-file tests. 
*/ 2621 testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode", 2622 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2623 combo1ExpectedPermuteCompat); 2624 testTsvSample(["test-c2", "--header", "--static-seed", "--print-random", 2625 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2626 combo1ExpectedPermuteCompatProbs); 2627 testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3", 2628 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2629 combo1ExpectedPermuteWt3Probs); 2630 testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode", 2631 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2632 combo1ExpectedPermuteWt3); 2633 testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", 2634 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2635 combo1ExpectedPermuteAlgoRNum4); 2636 2637 /* Multi-file, no headers. 
*/ 2638 testTsvSample(["test-c6", "--static-seed", "--compatibility-mode", 2639 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2640 fpath_data3x6_noheader, fpath_data3x2_noheader], 2641 combo1ExpectedPermuteCompat[1..$]); 2642 testTsvSample(["test-c7", "--static-seed", "--print-random", 2643 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2644 fpath_data3x6_noheader, fpath_data3x2_noheader], 2645 combo1ExpectedPermuteCompatProbs[1..$]); 2646 testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3", 2647 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2648 fpath_data3x6_noheader, fpath_data3x2_noheader], 2649 combo1ExpectedPermuteWt3Probs[1..$]); 2650 testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode", 2651 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2652 fpath_data3x6_noheader, fpath_data3x2_noheader], 2653 combo1ExpectedPermuteWt3[1..$]); 2654 testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4", 2655 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2656 fpath_data3x6_noheader, fpath_data3x2_noheader], 2657 combo1ExpectedPermuteAlgoRNum4[1..$]); 2658 2659 /* Bernoulli sampling cases. 
*/ 2660 testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5", 2661 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2662 combo1ExpectedBernoulliCompatP50Probs); 2663 testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4", 2664 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2665 combo1ExpectedBernoulliCompatP40); 2666 testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5", 2667 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2668 fpath_data3x6_noheader, fpath_data3x2_noheader], 2669 combo1ExpectedBernoulliCompatP50Probs[1..$]); 2670 testTsvSample(["test-c14", "--static-seed", "--prob", ".4", 2671 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2672 fpath_data3x6_noheader, fpath_data3x2_noheader], 2673 combo1ExpectedBernoulliCompatP40[1..$]); 2674 2675 /* Bernoulli sampling with probabilities in skip sampling range. */ 2676 testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03", 2677 fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10], 2678 combo2ExpectedBernoulliSkipV333P03); 2679 testTsvSample(["test-cc1", "-v", "333", "-p", "0.03", 2680 fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader], 2681 combo2ExpectedBernoulliSkipV333P03[1..$]); 2682 2683 /* Distinct sampling cases. */ 2684 testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4", 2685 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2686 combo1ExpectedDistinctK1P40); 2687 testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4", 2688 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2689 fpath_data3x6_noheader, fpath_data3x2_noheader], 2690 combo1ExpectedDistinctK1P40[1..$]); 2691 2692 /* Generating random weights. 
*/ 2693 testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder", 2694 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2695 combo1ExpectedProbsInorder); 2696 testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder", 2697 fpath_data3x3_noheader, fpath_data3x1_noheader, 2698 fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader], 2699 combo1ExpectedProbsInorder[1..$]); 2700 2701 /* Simple random sampling with replacement. */ 2702 testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10", 2703 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2704 combo1ExpectedReplaceNum10); 2705 2706 testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10", 2707 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2708 fpath_data3x6_noheader, fpath_data3x2_noheader], 2709 combo1ExpectedReplaceNum10[1..$]); 2710 2711 /* Single column file. */ 2712 testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 2713 testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 2714 2715 /* Distributions. */ 2716 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs); 2717 testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs); 2718 testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs); 2719 testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs); 2720 testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs); 2721 2722 /* Tests of subset sample (--n|num) field. 
2723 * 2724 * Note: The way these tests are done ensures that subset length does not affect 2725 * output order. 2726 */ 2727 import std.algorithm : min; 2728 for (size_t n = data3x6.length + 2; n >= 1; n--) 2729 { 2730 /* reservoirSamplingViaHeap. 2731 */ 2732 size_t expectedLength = min(data3x6.length, n + 1); 2733 testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string, 2734 "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 2735 2736 testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string, 2737 "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 2738 2739 testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string, 2740 "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]); 2741 2742 testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string, 2743 "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]); 2744 2745 testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string, 2746 "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]); 2747 2748 testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string, 2749 fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]); 2750 2751 testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string, 2752 "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]); 2753 2754 testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string, 2755 "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]); 2756 2757 testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string, 2758 "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]); 2759 2760 /* Bernoulli sampling. 
2761 */ 2762 import std.algorithm : min; 2763 size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length); 2764 2765 testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2766 "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]); 2767 2768 testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2769 "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]); 2770 2771 testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2772 "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]); 2773 2774 testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2775 fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]); 2776 2777 /* Distinct Sampling. 2778 */ 2779 size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length); 2780 2781 testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 2782 "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]); 2783 2784 testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 2785 fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]); 2786 2787 testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 2788 "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]); 2789 2790 testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 2791 fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]); 2792 } 2793 2794 /* Similar tests with the 1x10 data set. 
*/ 2795 for (size_t n = data1x10.length + 2; n >= 1; n--) 2796 { 2797 size_t expectedLength = min(data1x10.length, n + 1); 2798 testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string, 2799 "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]); 2800 2801 testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string, 2802 "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]); 2803 2804 testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string, 2805 fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]); 2806 2807 testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string, 2808 "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]); 2809 } 2810 2811 /* Simple random sampling with replacement: ensure sample size doesn't change order. */ 2812 for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--) 2813 { 2814 testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6], 2815 data3x6ExpectedReplaceNum10[0 .. n + 1]); 2816 2817 testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader], 2818 data3x6ExpectedReplaceNum10[1 .. n + 1]); 2819 } 2820 2821 /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */ 2822 for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--) 2823 { 2824 size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1); 2825 2826 testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 2827 "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]); 2828 2829 testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 2830 fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]); 2831 } 2832 2833 2834 /* Distinct sampling tests. 
*/ 2835 testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25], 2836 data5x25ExpectedDistinctK2P40); 2837 2838 testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25], 2839 data5x25ExpectedDistinctK2K4P20); 2840 2841 testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25], 2842 data5x25ExpectedDistinctK2K3K4P20); 2843 2844 testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader], 2845 data5x25ExpectedDistinctK2P40[1..$]); 2846 2847 testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader], 2848 data5x25ExpectedDistinctK2K4P20[1..$]); 2849 2850 testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader], 2851 data5x25ExpectedDistinctK2K3K4P20[1..$]); 2852 2853 2854 /* These distinct tests check that the whole line as '-k 0' and specifying all fields 2855 * in order have the same result. Also that field numbers don't matter, as '-k 1,2' 2856 * in data2x25 are the same keys as '-k 2,4' in data5x25. 2857 */ 2858 testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25], 2859 data2x25ExpectedDistinctK1K2P20); 2860 2861 testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25], 2862 data2x25ExpectedDistinctK1K2P20); 2863 2864 testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader], 2865 data2x25ExpectedDistinctK1K2P20[1..$]); 2866 2867 testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader], 2868 data2x25ExpectedDistinctK1K2P20[1..$]); 2869 2870 /* Similar to the last set, but for a 1-column file. Also with random value printing. 
*/ 2871 testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25], 2872 data1x25ExpectedDistinctK1P20); 2873 2874 testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25], 2875 data1x25ExpectedDistinctK1P20); 2876 2877 testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader], 2878 data1x25ExpectedDistinctK1P20[1..$]); 2879 2880 testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader], 2881 data1x25ExpectedDistinctK1P20[1..$]); 2882 2883 2884 testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25], 2885 data1x25ExpectedDistinctK1P20Probs); 2886 2887 testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25], 2888 data1x25ExpectedDistinctK1P20Probs); 2889 2890 testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader], 2891 data1x25ExpectedDistinctK1P20Probs[1..$]); 2892 2893 testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader], 2894 data1x25ExpectedDistinctK1P20Probs[1..$]); 2895 2896 2897 testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25], 2898 data1x25ExpectedDistinctK1P20ProbsInorder); 2899 2900 testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25], 2901 data1x25ExpectedDistinctK1P20ProbsInorder); 2902 2903 testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader], 2904 data1x25ExpectedDistinctK1P20ProbsInorder[1..$]); 2905 2906 testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader], 2907 data1x25ExpectedDistinctK1P20ProbsInorder[1..$]); 2908 2909 }