1 /** 2 Command line tool for shuffling or sampling lines from input streams. Several methods 3 are available, including weighted and unweighted shuffling, simple and weighted random 4 sampling, sampling with replacement, Bernoulli sampling, and distinct sampling. 5 6 Copyright (c) 2017-2020, eBay Inc. 7 Initially written by Jon Degenhardt 8 9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) 10 */ 11 module tsv_utils.tsv_sample; 12 13 import std.array : appender, Appender, RefAppender; 14 import std.exception : enforce; 15 import std.format : format; 16 import std.range; 17 import std.stdio; 18 import std.typecons : tuple, Flag; 19 20 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; 21 22 version(unittest) 23 { 24 // When running unit tests, use main from -main compiler switch. 25 } 26 else 27 { 28 /** Main program. 29 * 30 * Invokes command line argument processing and calls tsvSample to do the real 31 * work. Errors occurring during processing are caught and reported to the user. 32 */ 33 int main(string[] cmdArgs) 34 { 35 /* When running in DMD code coverage mode, turn on report merging. */ 36 version(D_Coverage) version(DigitalMars) 37 { 38 import core.runtime : dmd_coverSetMerge; 39 dmd_coverSetMerge(true); 40 } 41 42 TsvSampleOptions cmdopt; 43 const r = cmdopt.processArgs(cmdArgs); 44 if (!r[0]) return r[1]; 45 version(LDC_Profile) 46 { 47 import ldc.profile : resetAll; 48 resetAll(); 49 } 50 try 51 { 52 import tsv_utils.common.utils : BufferedOutputRange; 53 auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); 54 55 tsvSample(cmdopt, bufferedOutput); 56 } 57 catch (Exception exc) 58 { 59 stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 60 return 1; 61 } 62 return 0; 63 } 64 } 65 66 immutable helpText = q"EOS 67 Synopsis: tsv-sample [options] [file...] 68 69 Sample input lines or randomize their order. Several modes of operation 70 are available: 71 * Shuffling (the default): All input lines are output in random order. All 72 orderings are equally likely. 73 * Random sampling (--n|num N): A random sample of N lines are selected and 74 written to standard output. By default, selected lines are written in 75 random order. All sample sets and orderings are equally likely. Use 76 --i|inorder to write the selected lines in the original input order. 77 * Weighted random sampling (--n|num N, --w|weight-field F): A weighted 78 sample of N lines is produced. Weights are taken from field F. Lines are 79 output in weighted selection order. Use --i|inorder to write in original 80 input order. Omit --n|num to shuffle all lines (weighted shuffling). 81 * Sampling with replacement (--r|replace, --n|num N): All input lines are 82 read in, then lines are repeatedly selected at random and written out. 83 This continues until N lines are output. Individual lines can be written 84 multiple times. Output continues forever if N is zero or not provided. 85 * Bernoulli sampling (--p|prob P): A random subset of lines is selected 86 based on probability P, a 0.0-1.0 value. This is a streaming operation. 87 A decision is made on each line as it is read. Line order is not changed. 88 * Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled 89 based on the values in the key fields. A subset of keys are chosen based 90 on the inclusion probability (a 'distinct' set of keys). All lines with 91 one of the selected keys are output. Line order is not changed. 92 93 Fields are specified using field number or field name. Field names require 94 that the input file has a header line. 95 96 Use '--help-verbose' for detailed information. 97 98 Options: 99 EOS"; 100 101 immutable helpTextVerbose = q"EOS 102 Synopsis: tsv-sample [options] [file...] 103 104 Sample input lines or randomize their order. Several modes of operation 105 are available: 106 * Shuffling (the default): All input lines are output in random order. All 107 orderings are equally likely. 108 * Random sampling (--n|num N): A random sample of N lines are selected and 109 written to standard output. By default, selected lines are written in 110 random order. All sample sets and orderings are equally likely. Use 111 --i|inorder to write the selected lines in the original input order. 112 * Weighted random sampling (--n|num N, --w|weight-field F): A weighted 113 sample of N lines is produced. Weights are taken from field F. Lines are 114 output in weighted selection order. Use --i|inorder to write in original 115 input order. Omit --n|num to shuffle all lines (weighted shuffling). 116 * Sampling with replacement (--r|replace, --n|num N): All input lines are 117 read in, then lines are repeatedly selected at random and written out. 118 This continues until N lines are output. Individual lines can be written 119 multiple times. Output continues forever if N is zero or not provided. 120 * Bernoulli sampling (--p|prob P): A random subset of lines is selected 121 based on probability P, a 0.0-1.0 value. This is a streaming operation. 122 A decision is made on each line as it is read. Line order is not changed. 123 * Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled 124 based on the values in the key fields. A subset of keys are chosen based 125 on the inclusion probability (a 'distinct' set of keys). All lines with 126 one of the selected keys are output. Line order is not changed. 127 128 Fields: Fields are specified by field number or name. Field names require 129 the input file to have a header line. Use '--help-fields' for details. 130 131 Sample size: The '--n|num' option controls the sample size for all 132 sampling methods. In the case of simple and weighted random sampling it 133 also limits the amount of memory required. 134 135 Controlling the random seed: By default, each run produces a different 136 randomization or sampling. Using '--s|static-seed' changes this so 137 multiple runs produce the same results. This works by using the same 138 random seed each run. The random seed can be specified using 139 '--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero 140 value is a no-op and ignored.) 141 142 Memory use: Bernoulli sampling and distinct sampling make decisions on 143 each line as it is read, there is no memory accumulation. These algorithms 144 can run on arbitrary size inputs. Sampling with replacement reads all 145 lines into memory and is limited by available memory. Shuffling also reads 146 all lines into memory and is similarly limited. Random sampling uses 147 reservoir sampling, and only needs to hold the sample size (--n|num) in 148 memory. The input data can be of any length. 149 150 Weighted sampling: Weighted random sampling is done using an algorithm 151 described by Pavlos Efraimidis and Paul Spirakis. Weights should be 152 positive values representing the relative weight of the entry in the 153 collection. Counts and similar can be used as weights, it is *not* 154 necessary to normalize to a [0,1] interval. Negative values are not 155 meaningful and given the value zero. Input order is not retained, instead 156 lines are output ordered by the randomized weight that was assigned. This 157 means that a smaller valid sample can be produced by taking the first N 158 lines of output. For more info on the sampling approach see: 159 * Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling 160 * "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis 161 (https://arxiv.org/abs/1012.0256) 162 163 Printing random values: Most of the sampling algorithms work by generating 164 a random value for each line. (See "Compatibility mode" below.) The nature 165 of these values depends on the sampling algorithm. They are used for both 166 line selection and output ordering. The '--p|print-random' option can be 167 used to print these values. The random value is prepended to the line 168 separated by the --d|delimiter char (TAB by default). The 169 '--gen-random-inorder' option takes this one step further, generating 170 random values for all input lines without changing the input order. The 171 types of values currently used by these sampling algorithms: 172 * Unweighted sampling: Uniform random value in the interval [0,1]. This 173 includes Bernoulli sampling and unweighted line order randomization. 174 * Weighted sampling: Value in the interval [0,1]. Distribution depends on 175 the values in the weight field. It is used as a partial ordering. 176 * Distinct sampling: An integer, zero and up, representing a selection 177 group. The inclusion probability determines the number of selection groups. 178 * Sampling with replacement: Random value printing is not supported. 179 180 The specifics behind these random values are subject to change in future 181 releases. 182 183 Compatibility mode: As described above, many of the sampling algorithms 184 assign a random value to each line. This is useful when printing random 185 values. It has another occasionally useful property: repeated runs with 186 the same static seed but different selection parameters are more 187 compatible with each other, as each line gets assigned the same random 188 value on every run. For example, if Bernoulli sampling is run with 189 '--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed', 190 all the lines selected in the first run will be selected in the second. 191 This comes at a cost: in some cases there are faster algorithms that don't 192 preserve this property. By default, tsv-sample will use faster algorithms 193 when available. However, the '--compatibility-mode' option switches to 194 algorithms that assign a random value per line. Printing random values 195 also engages compatibility mode. 196 197 Options: 198 EOS"; 199 200 /** Container for command line options and derived data. 201 * 202 * TsvSampleOptions handles several aspects of command line options. On the input side, 203 * it defines the command line options available, performs validation, and sets up any 204 * derived state based on the options provided. These activities are handled by the 205 * processArgs() member. 206 * 207 * Once argument processing is complete, TsvSampleOptions is used as a container 208 * holding the specific processing options used by the different sampling routines. 209 */ 210 struct TsvSampleOptions 211 { 212 import tsv_utils.common.utils : InputSourceRange; 213 214 string programName; /// Program name 215 InputSourceRange inputSources; /// Input files 216 bool hasHeader = false; /// --H|header 217 ulong sampleSize = 0; /// --n|num - Size of the desired sample 218 double inclusionProbability = double.nan; /// --p|prob - Inclusion probability 219 size_t[] keyFields; /// Derived: --k|key-fields - Used with inclusion probability 220 size_t weightField = 0; /// Derived: --w|weight-field - Field holding the weight 221 bool srsWithReplacement = false; /// --r|replace 222 bool preserveInputOrder = false; /// --i|inorder 223 bool staticSeed = false; /// --s|static-seed 224 uint seedValueOptionArg = 0; /// --v|seed-value 225 bool printRandom = false; /// --print-random 226 bool genRandomInorder = false; /// --gen-random-inorder 227 string randomValueHeader = "random_value"; /// --random-value-header 228 bool compatibilityMode = false; /// --compatibility-mode 229 char delim = '\t'; /// --d|delimiter 230 bool preferSkipSampling = false; /// --prefer-skip-sampling 231 bool preferAlgorithmR = false; /// --prefer-algorithm-r 232 bool hasWeightField = false; /// Derived. 233 bool useBernoulliSampling = false; /// Derived. 234 bool useDistinctSampling = false; /// Derived. 235 bool distinctKeyIsFullLine = false; /// Derived. True if '--k|key-fields 0' is specfied. 236 bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value 237 uint seed = 0; /// Derived from --static-seed, --seed-value 238 239 /** Process tsv-sample command line arguments. 240 * 241 * Defines the command line options, performs validation, and derives additional 242 * state. std.getopt.getopt is called to do the main option processing followed 243 * additional validation and derivation. 244 * 245 * Help text is printed to standard output if help was requested. Error text is 246 * written to stderr if invalid input is encountered. 247 * 248 * A tuple is returned. First value is true if command line arguments were 249 * successfully processed and execution should continue, or false if an error 250 * occurred or the user asked for help. If false, the second value is the 251 * appropriate exit code (0 or 1). 252 * 253 * Returning true (execution continues) means args have been validated and derived 254 * values calculated. Field indices will have been converted to zero-based. 255 */ 256 auto processArgs(ref string[] cmdArgs) 257 { 258 import std.algorithm : all, canFind, each; 259 import std.conv : to; 260 import std.getopt; 261 import std.math : isNaN; 262 import std.path : baseName, stripExtension; 263 import std.typecons : Yes, No; 264 import tsv_utils.common.utils : inputSourceRange, ReadHeader, throwIfWindowsNewlineOnUnix; 265 import tsv_utils.common.fieldlist; 266 267 bool helpVerbose = false; // --help-verbose 268 bool helpFields = false; // --help-fields 269 bool versionWanted = false; // --V|version 270 string keyFieldsArg; // --k|key-fields 271 string weightFieldArg; // --w|weight-field 272 273 string keyFieldsOptionString = "k|key-fields"; 274 string weightFieldOptionString = "w|weight-field"; 275 276 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 277 278 try 279 { 280 arraySep = ","; // Use comma to separate values in command line options 281 auto r = getopt( 282 cmdArgs, 283 "help-verbose", " Print more detailed help.", &helpVerbose, 284 "help-fields", " Print help on specifying fields.", &helpFields, 285 286 std.getopt.config.caseSensitive, 287 "H|header", " Treat the first line of each file as a header.", &hasHeader, 288 std.getopt.config.caseInsensitive, 289 290 "n|num", "NUM Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize, 291 "p|prob", "NUM Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability, 292 293 keyFieldsOptionString, 294 "<field-list> Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.", 295 &keyFieldsArg, 296 297 weightFieldOptionString, 298 "NUM Field containing weights. All lines get equal weight if not provided.", 299 &weightFieldArg, 300 301 "r|replace", " Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement, 302 "i|inorder", " Output random samples in original input order. Requires use of --n|num.", &preserveInputOrder, 303 "s|static-seed", " Use the same random seed every run.", &staticSeed, 304 305 std.getopt.config.caseSensitive, 306 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, 307 std.getopt.config.caseInsensitive, 308 309 "print-random", " Include the assigned random value (prepended) when writing output lines.", &printRandom, 310 "gen-random-inorder", " Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder, 311 "random-value-header", " Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader, 312 "compatibility-mode", " Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode, 313 314 "d|delimiter", "CHR Field delimiter.", &delim, 315 316 std.getopt.config.caseSensitive, 317 "V|version", " Print version information and exit.", &versionWanted, 318 std.getopt.config.caseInsensitive, 319 320 "prefer-skip-sampling", " (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.", 321 &preferSkipSampling, 322 323 "prefer-algorithm-r", " (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.", 324 &preferAlgorithmR, 325 ); 326 327 if (r.helpWanted) 328 { 329 defaultGetoptPrinter(helpText, r.options); 330 return tuple(false, 0); 331 } 332 else if (helpVerbose) 333 { 334 defaultGetoptPrinter(helpTextVerbose, r.options); 335 return tuple(false, 0); 336 } 337 else if (helpFields) 338 { 339 writeln(fieldListHelpText); 340 return tuple(false, 0); 341 } 342 else if (versionWanted) 343 { 344 import tsv_utils.common.tsvutils_version; 345 writeln(tsvutilsVersionNotice("tsv-sample")); 346 return tuple(false, 0); 347 } 348 349 /* Input files. Remaining command line args are files. */ 350 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 351 cmdArgs.length = 1; 352 353 /* Validation and derivations - Do as much validation prior to header line 354 * processing as possible (avoids waiting on stdin). 355 * 356 * Note: keyFields and weightField depend on header line processing, but 357 * keyFieldsArg and weightFieldArg can be used to detect whether the 358 * command line argument was specified. 359 */ 360 361 /* Set hasWeightField here so it can be used in other validation checks. 362 * Field validity checked after reading file header. 363 */ 364 hasWeightField = !weightFieldArg.empty; 365 366 /* Sampling with replacement checks (--r|replace). */ 367 if (srsWithReplacement) 368 { 369 enforce(!hasWeightField, 370 "Sampling with replacement (--r|replace) does not support weights (--w|weight-field)."); 371 372 enforce(inclusionProbability.isNaN, 373 "Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob)."); 374 375 enforce(keyFieldsArg.empty, 376 "Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields)."); 377 378 enforce(!printRandom && !genRandomInorder, 379 "Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder)."); 380 381 enforce(!preserveInputOrder, 382 "Sampling with replacement (--r|replace) does not support input order preservation (--i|inorder option)."); 383 } 384 385 /* Distinct sampling checks (--k|key-fields --p|prob). */ 386 enforce(keyFieldsArg.empty | !inclusionProbability.isNaN, 387 "--p|prob is required when using --k|key-fields."); 388 389 /* Inclusion probability (--p|prob) is used for both Bernoulli sampling 390 * and distinct sampling. 391 */ 392 if (!inclusionProbability.isNaN) 393 { 394 enforce(inclusionProbability > 0.0 && inclusionProbability <= 1.0, 395 format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability)); 396 397 if (!keyFieldsArg.empty) useDistinctSampling = true; 398 else useBernoulliSampling = true; 399 400 enforce(!hasWeightField, "--w|weight-field and --p|prob cannot be used together."); 401 402 enforce(!genRandomInorder || useDistinctSampling, 403 "--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used." ~ 404 "\nUse --gen-random-inorder alone to print probabilities for all lines." ~ 405 "\nUse --p|prob and --print-random to print probabilities for lines satisfying the probability threshold."); 406 } 407 else if (genRandomInorder && !hasWeightField) 408 { 409 useBernoulliSampling = true; 410 } 411 412 /* randomValueHeader (--random-value-header) validity. Note that 413 randomValueHeader is initialized to a valid, non-empty string. 414 */ 415 enforce(!randomValueHeader.empty && !randomValueHeader.canFind('\n') && 416 !randomValueHeader.canFind(delim), 417 "--randomValueHeader must be at least one character and not contain field delimiters or newlines."); 418 419 /* Check for incompatible use of (--i|inorder) and shuffling of the full 420 * data set. Sampling with replacement is also incompatible, this is 421 * detected earlier. Shuffling is the default operation, so it identified 422 * by eliminating the other modes of operation. 423 */ 424 enforce(!preserveInputOrder || 425 sampleSize != 0 || 426 useBernoulliSampling || 427 useDistinctSampling, 428 "Preserving input order (--i|inorder) is not compatible with full data set shuffling. Switch to random sampling with a sample size (--n|num) to use --i|inorder."); 429 430 431 /* Compatibility mode checks: 432 * - Random value printing implies compatibility-mode, otherwise user's 433 * selection is used. 434 * - Distinct sampling doesn't support compatibility-mode. The routines 435 * don't care, but users might expect larger probabilities to be a 436 * superset of smaller probabilities. This would be confusing, so 437 * flag it as an error. 438 */ 439 enforce(!(compatibilityMode && useDistinctSampling), 440 "Distinct sampling (--k|key-fields --p|prob) does not support --compatibility-mode."); 441 442 if (printRandom || genRandomInorder) compatibilityMode = true; 443 444 445 /* Seed. */ 446 import std.random : unpredictableSeed; 447 448 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 449 450 if (usingUnpredictableSeed) seed = unpredictableSeed; 451 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 452 else if (staticSeed) seed = 2438424139; 453 else assert(0, "Internal error, invalid seed option states."); 454 455 string[] headerFields; 456 457 /* fieldListArgProcessing encapsulates the field list processing. It is 458 * called prior to reading the header line if headers are not being used, 459 * and after if headers are being used. 460 */ 461 void fieldListArgProcessing() 462 { 463 if (!weightFieldArg.empty) 464 { 465 auto fieldIndices = 466 weightFieldArg 467 .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero) 468 (hasHeader, headerFields, weightFieldOptionString) 469 .array; 470 471 enforce(fieldIndices.length == 1, 472 format("'--%s' must be a single field.", weightFieldOptionString)); 473 474 weightField = fieldIndices[0]; 475 } 476 477 if (!keyFieldsArg.empty) 478 { 479 keyFields = 480 keyFieldsArg 481 .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) 482 (hasHeader, headerFields, keyFieldsOptionString) 483 .array; 484 485 assert(keyFields.length > 0); 486 487 if (keyFields.length > 0) 488 { 489 if (keyFields.length == 1 && keyFields[0] == 0) 490 { 491 distinctKeyIsFullLine = true; 492 } 493 else 494 { 495 enforce(keyFields.length <= 1 || keyFields.all!(x => x != 0), 496 "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); 497 498 keyFields.each!((ref x) => --x); // Convert to zero-based indexing. 499 } 500 } 501 } 502 } 503 504 if (!hasHeader) fieldListArgProcessing(); 505 506 /* 507 * Create the inputSourceRange and perform header line processing. 508 */ 509 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 510 inputSources = inputSourceRange(filepaths, readHeader); 511 512 if (hasHeader) 513 { 514 throwIfWindowsNewlineOnUnix(inputSources.front.header, inputSources.front.name, 1); 515 headerFields = inputSources.front.header.split(delim).to!(string[]); 516 fieldListArgProcessing(); 517 } 518 519 } 520 catch (Exception exc) 521 { 522 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 523 return tuple(false, 1); 524 } 525 return tuple(true, 0); 526 } 527 } 528 /** Invokes the appropriate sampling routine based on the command line arguments. 529 * 530 * tsvSample is the top-level routine handling the different tsv-sample use cases. 531 * Its primary role is to invoke the correct routine for type of sampling requested. 532 */ 533 void tsvSample(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 534 if (isOutputRange!(OutputRange, char)) 535 { 536 if (cmdopt.srsWithReplacement) 537 { 538 simpleRandomSamplingWithReplacement(cmdopt, outputStream); 539 } 540 else if (cmdopt.useBernoulliSampling) 541 { 542 bernoulliSamplingCommand(cmdopt, outputStream); 543 } 544 else if (cmdopt.useDistinctSampling) 545 { 546 if (cmdopt.genRandomInorder) distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 547 else distinctSampling!(No.generateRandomAll)(cmdopt, outputStream); 548 } 549 else if (cmdopt.genRandomInorder) 550 { 551 /* Note that the preceding cases handle gen-random-inorder themselves (Bernoulli, 552 * Distinct), or don't handle it (SRS w/ Replacement). 553 */ 554 assert(cmdopt.hasWeightField); 555 generateWeightedRandomValuesInorder(cmdopt, outputStream); 556 } 557 else if (cmdopt.sampleSize != 0) 558 { 559 randomSamplingCommand(cmdopt, outputStream); 560 } 561 else 562 { 563 shuffleCommand(cmdopt, outputStream); 564 } 565 } 566 567 /** Bernoulli sampling command handler. Invokes the appropriate Bernoulli sampling 568 * routine based on the command line arguments. 569 * 570 * This routine selects the appropriate Bernoulli sampling function and template 571 * instantiation to use based on the command line arguments. 572 * 573 * One of the basic choices is whether to use the vanilla algorithm or skip sampling. 574 * Skip sampling is a little bit faster when the inclusion probability is small but 575 * doesn't support compatibility mode. See the bernoulliSkipSampling documentation 576 * for a discussion of the skipSamplingProbabilityThreshold used here. 577 */ 578 void bernoulliSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 579 if (isOutputRange!(OutputRange, char)) 580 { 581 assert(!cmdopt.hasWeightField); 582 583 immutable double skipSamplingProbabilityThreshold = 0.04; 584 585 if (cmdopt.compatibilityMode || 586 (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling)) 587 { 588 if (cmdopt.genRandomInorder) 589 { 590 bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 591 } 592 else 593 { 594 bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream); 595 } 596 } 597 else 598 { 599 bernoulliSkipSampling(cmdopt, outputStream); 600 } 601 } 602 603 /** Bernoulli sampling of lines from the input stream. 604 * 605 * Each input line is a assigned a random value and output if less than 606 * cmdopt.inclusionProbability. The order of the lines is not changed. 607 * 608 * This routine supports random value printing and gen-random-inorder value printing. 609 */ 610 void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 611 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 612 if (isOutputRange!(OutputRange, char)) 613 { 614 import std.random : Random = Mt19937, uniform01; 615 import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, 616 InputSourceRange, throwIfWindowsNewlineOnUnix; 617 618 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 619 else assert(!cmdopt.genRandomInorder); 620 621 assert(!cmdopt.inputSources.empty); 622 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 623 624 auto randomGenerator = Random(cmdopt.seed); 625 626 /* First header is read during command line argument processing. */ 627 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 628 { 629 auto inputStream = cmdopt.inputSources.front; 630 631 static if (generateRandomAll) 632 { 633 outputStream.put(cmdopt.randomValueHeader); 634 outputStream.put(cmdopt.delim); 635 } 636 else if (cmdopt.printRandom) 637 { 638 outputStream.put(cmdopt.randomValueHeader); 639 outputStream.put(cmdopt.delim); 640 } 641 642 outputStream.put(inputStream.header); 643 outputStream.put("\n"); 644 645 /* Immediately flush the header so subsequent processes in a unix command 646 * pipeline see it early. This helps provide timely error messages. 647 */ 648 static if (isFlushableOutputRange!OutputRange) outputStream.flush; 649 } 650 651 /* Process each line. */ 652 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 653 ulong numLinesWritten = 0; 654 655 foreach (inputStream; cmdopt.inputSources) 656 { 657 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 658 659 foreach (ulong fileLineNum, line; 660 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 661 { 662 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 663 664 immutable double lineScore = uniform01(randomGenerator); 665 666 static if (generateRandomAll) 667 { 668 outputStream.formatRandomValue(lineScore); 669 outputStream.put(cmdopt.delim); 670 outputStream.put(line); 671 outputStream.put("\n"); 672 673 if (cmdopt.sampleSize != 0) 674 { 675 ++numLinesWritten; 676 if (numLinesWritten == cmdopt.sampleSize) return; 677 } 678 } 679 else if (lineScore < cmdopt.inclusionProbability) 680 { 681 if (cmdopt.printRandom) 682 { 683 outputStream.formatRandomValue(lineScore); 684 outputStream.put(cmdopt.delim); 685 } 686 outputStream.put(line); 687 outputStream.put("\n"); 688 689 if (cmdopt.sampleSize != 0) 690 { 691 ++numLinesWritten; 692 if (numLinesWritten == cmdopt.sampleSize) return; 693 } 694 } 695 } 696 } 697 } 698 699 /** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips. 700 * 701 * Skip sampling works by skipping a random number of lines between selections. This 702 * can be faster than assigning a random value to each line when the inclusion 703 * probability is low, as it reduces the number of calls to the random number 704 * generator. Both the random number generator and the log() function are called when 705 * calculating the next skip size. These additional log() calls add up as the 706 * inclusion probability increases. 707 * 708 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for 709 * file-oriented line sampling. This is obviously environment specific. In the 710 * environments this implementation has been tested in the performance improvements 711 * remain small, less than 7%, even with an inclusion probability as low as 0.0001. 712 * 713 * The algorithm does not assign random values to individual lines. This makes it 714 * incompatible with random value printing. It is not suitable for compatibility mode 715 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should 716 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling 717 * does not have this property. 718 * 719 * The algorithm for calculating the skip size has been described by multiple sources. 720 * There are two key variants depending on whether the total number of lines in the 721 * data set is known in advance. (This implementation does not know the total.) 722 * Useful references: 723 * $(LIST 724 * * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling", 725 * ACM Trans on Mathematical Software, 1987. On-line: 726 * http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf 727 * * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book 728 * "Data Stream Management", Springer-Verlag, 2016. On-line: 729 * https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf 730 * * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line: 731 * http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/ 732 * ) 733 */ 734 void bernoulliSkipSampling(OutputRange)(ref TsvSampleOptions cmdopt, OutputRange outputStream) 735 if (isOutputRange!(OutputRange, char)) 736 { 737 import std.conv : to; 738 import std.math : log, trunc; 739 import std.random : Random = Mt19937, uniform01; 740 import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, 741 InputSourceRange, throwIfWindowsNewlineOnUnix; 742 743 assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0); 744 assert(!cmdopt.printRandom); 745 assert(!cmdopt.compatibilityMode); 746 747 assert(!cmdopt.inputSources.empty); 748 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 749 750 auto randomGenerator = Random(cmdopt.seed); 751 752 immutable double discardRate = 1.0 - cmdopt.inclusionProbability; 753 immutable double logDiscardRate = log(discardRate); 754 755 /* Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed 756 * interval to (0.0, 1.0], excluding 0.0. 757 */ 758 size_t remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; 759 760 /* First header is read during command line argument processing. */ 761 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 762 { 763 auto inputStream = cmdopt.inputSources.front; 764 765 outputStream.put(inputStream.header); 766 outputStream.put("\n"); 767 768 /* Immediately flush the header so subsequent processes in a unix command 769 * pipeline see it early. This helps provide timely error messages. 770 */ 771 static if (isFlushableOutputRange!OutputRange) outputStream.flush; 772 } 773 774 /* Process each line. */ 775 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 776 ulong numLinesWritten = 0; 777 foreach (inputStream; cmdopt.inputSources) 778 { 779 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 780 781 foreach (ulong fileLineNum, line; 782 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 783 { 784 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 785 786 if (remainingSkips > 0) 787 { 788 --remainingSkips; 789 } 790 else 791 { 792 outputStream.put(line); 793 outputStream.put("\n"); 794 795 if (cmdopt.sampleSize != 0) 796 { 797 ++numLinesWritten; 798 if (numLinesWritten == cmdopt.sampleSize) return; 799 } 800 801 remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; 802 } 803 } 804 } 805 } 806 807 /** Sample lines by choosing a random set of distinct keys formed from one or more 808 * fields on each line. 809 * 810 * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling. 811 * However, instead of each line being subject to an independent trial, lines are 812 * selected based on a key from each line. A portion of keys are randomly selected for 813 * output, and every line containing a selected key is included in the output. 814 * 815 * An example use-case is a query log having <user, query, clicked-url> triples. It is 816 * often useful to sample records for portion of the users, but including all records 817 * for the users selected. Distinct sampling supports this by selecting a subset of 818 * users to include in the output. 819 * 820 * Distinct sampling is done by hashing the key and mapping the hash value into 821 * buckets sized to hold the inclusion probability. Records having a key mapping to 822 * bucket zero are output. Buckets are equal size and therefore may be larger than the 823 * inclusion probability. (The other approach would be to have the caller specify the 824 * the number of buckets. More correct, but less convenient.) 825 */ 826 void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 827 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 828 if (isOutputRange!(OutputRange, char)) 829 { 830 import std.algorithm : splitter; 831 import std.conv : to; 832 import std.digest.murmurhash; 833 import std.math : lrint; 834 import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, 835 InputFieldReordering, InputSourceRange, throwIfWindowsNewlineOnUnix; 836 837 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 838 else assert(!cmdopt.genRandomInorder); 839 840 assert(cmdopt.keyFields.length > 0); 841 assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0); 842 843 assert(!cmdopt.inputSources.empty); 844 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 845 846 static if (generateRandomAll) 847 { 848 import std.format : formatValue, singleSpec; 849 immutable randomValueFormatSpec = singleSpec("%d"); 850 } 851 852 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 853 854 uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint; 855 856 /* Create a mapping for the key fields. */ 857 auto keyFieldsReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); 858 859 /* First header is read during command line argument processing. */ 860 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 861 { 862 auto inputStream = cmdopt.inputSources.front; 863 864 static if (generateRandomAll) 865 { 866 outputStream.put(cmdopt.randomValueHeader); 867 outputStream.put(cmdopt.delim); 868 } 869 else if (cmdopt.printRandom) 870 { 871 outputStream.put(cmdopt.randomValueHeader); 872 outputStream.put(cmdopt.delim); 873 } 874 875 outputStream.put(inputStream.header); 876 outputStream.put("\n"); 877 878 /* Immediately flush the header so subsequent processes in a unix command 879 * pipeline see it early. This helps provide timely error messages. 880 */ 881 static if (isFlushableOutputRange!OutputRange) outputStream.flush; 882 } 883 884 /* Process each line. */ 885 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 886 ulong numLinesWritten = 0; 887 888 foreach (inputStream; cmdopt.inputSources) 889 { 890 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 891 892 foreach (ulong fileLineNum, line; 893 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 894 { 895 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 896 897 /* Murmurhash works by successively adding individual keys, then finalizing. 898 * Adding individual keys is simpler if the full-line-as-key and individual 899 * fields as keys cases are separated. 900 */ 901 auto hasher = MurmurHash3!32(cmdopt.seed); 902 903 if (cmdopt.distinctKeyIsFullLine) 904 { 905 hasher.put(cast(ubyte[]) line); 906 } 907 else 908 { 909 assert(keyFieldsReordering !is null); 910 911 /* Gather the key field values and assemble the key. */ 912 keyFieldsReordering.initNewLine; 913 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 914 { 915 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 916 if (keyFieldsReordering.allFieldsFilled) break; 917 } 918 919 enforce(keyFieldsReordering.allFieldsFilled, 920 format("Not enough fields in line. File: %s, Line: %s", 921 inputStream.name, fileLineNum)); 922 923 foreach (count, key; keyFieldsReordering.outputFields.enumerate) 924 { 925 if (count > 0) hasher.put(delimArray); 926 hasher.put(cast(ubyte[]) key); 927 } 928 } 929 930 hasher.finish; 931 932 static if (generateRandomAll) 933 { 934 import std.conv : to; 935 outputStream.formatValue(hasher.get % numBuckets, randomValueFormatSpec); 936 outputStream.put(cmdopt.delim); 937 outputStream.put(line); 938 outputStream.put("\n"); 939 940 if (cmdopt.sampleSize != 0) 941 { 942 ++numLinesWritten; 943 if (numLinesWritten == cmdopt.sampleSize) return; 944 } 945 } 946 else if (hasher.get % numBuckets == 0) 947 { 948 if (cmdopt.printRandom) 949 { 950 outputStream.put('0'); 951 outputStream.put(cmdopt.delim); 952 } 953 outputStream.put(line); 954 outputStream.put("\n"); 955 956 if (cmdopt.sampleSize != 0) 957 { 958 ++numLinesWritten; 959 if (numLinesWritten == cmdopt.sampleSize) return; 960 } 961 } 962 } 963 } 964 } 965 966 /** Random sampling command handler. Invokes the appropriate sampling routine based on 967 * the command line arguments. 968 * 969 * Random sampling selects a fixed size random sample from the input stream. Both 970 * simple random sampling (equal likelihood) and weighted random sampling are 971 * supported. Selected lines are output either in random order or original input order. 972 * For weighted sampling the random order is the weighted selection order. 973 * 974 * Two algorithms are used, reservoir sampling via a heap and reservoir sampling via 975 * Algorithm R. This routine selects the appropriate reservoir sampling function and 976 * template instantiation to based on the command line arguments. 977 * 978 * Weighted sampling always uses the heap approach. Compatibility mode does as well, 979 * as it is the method that uses per-line random value assignments. The implication 980 * of compatibility mode is that a larger sample size includes all the results from 981 * a smaller sample, assuming the same random seed is used. 982 * 983 * For unweighted sampling there is a performance tradeoff between implementations. 984 * Heap-based sampling is faster for small sample sizes. Algorithm R is faster for 985 * large sample sizes. The threshold used was chosen based on performance tests. See 986 * the reservoirSamplingAlgorithmR documentation for more information. 987 */ 988 989 void randomSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 990 if (isOutputRange!(OutputRange, char)) 991 { 992 assert(cmdopt.sampleSize != 0); 993 994 immutable size_t algorithmRSampleSizeThreshold = 128 * 1024; 995 996 if (cmdopt.hasWeightField) 997 { 998 if (cmdopt.preserveInputOrder) 999 { 1000 reservoirSamplingViaHeap!(Yes.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream); 1001 } 1002 else 1003 { 1004 reservoirSamplingViaHeap!(Yes.isWeighted, No.preserveInputOrder)(cmdopt, outputStream); 1005 } 1006 } 1007 else if (cmdopt.compatibilityMode || 1008 (cmdopt.sampleSize < algorithmRSampleSizeThreshold && !cmdopt.preferAlgorithmR)) 1009 { 1010 if (cmdopt.preserveInputOrder) 1011 { 1012 reservoirSamplingViaHeap!(No.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream); 1013 } 1014 else 1015 { 1016 reservoirSamplingViaHeap!(No.isWeighted, No.preserveInputOrder)(cmdopt, outputStream); 1017 } 1018 } 1019 else if (cmdopt.preserveInputOrder) 1020 { 1021 reservoirSamplingAlgorithmR!(Yes.preserveInputOrder)(cmdopt, outputStream); 1022 } 1023 else 1024 { 1025 reservoirSamplingAlgorithmR!(No.preserveInputOrder)(cmdopt, outputStream); 1026 } 1027 } 1028 1029 /** Reservoir sampling using a heap. Both weighted and unweighted random sampling are 1030 * supported. 1031 * 1032 * The algorithm used here is based on the one-pass algorithm described by Pavlos 1033 * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S. 1034 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are 1035 * simply set to one. 1036 * 1037 * The implementation uses a heap (priority queue) large enough to hold the desired 1038 * number of lines. Input is read line-by-line, assigned a random value, and added to 1039 * the heap. The role of the heap is to identify the lines with the highest assigned 1040 * random values. Once the heap is full, adding a new line means dropping the line with 1041 * the lowest score. A "min" heap used for this reason. 1042 * 1043 * When done reading all lines, the "min" heap is in reverse of weighted selection 1044 * order. Weighted selection order is obtained by removing each element one at at time 1045 * from the heap. The underlying data store will have the elements in weighted selection 1046 * order (largest weights first). 1047 * 1048 * Generating output in weighted order is useful for several reasons: 1049 * - For weighted sampling, it preserves the property that smaller valid subsets can be 1050 * created by taking the first N lines. 1051 * - For unweighted sampling, it ensures that all output permutations are possible, and 1052 * are not influenced by input order or the heap data structure used. 1053 * - Order consistency is maintained when making repeated use of the same random seed, 1054 * but with different sample sizes. 1055 * 1056 * The other choice is preserving input order. This is supporting by recording line 1057 * numbers and sorting the selected sample. 1058 * 1059 * There are use cases where only the selection set matters. For these some performance 1060 * could be gained by skipping the reordering and simply printing the backing store 1061 * array in-order. Performance tests indicate only a minor benefit, so this is not 1062 * supported. 1063 * 1064 * Notes: 1065 * $(LIST 1066 * * In tsv-sample versions 1.2.1 and earlier this routine also supported 1067 * randomization of all input lines. This was dropped in version 1.2.2 in favor 1068 * of the approach used in randomizeLines. The latter has significant advantages 1069 * given that all data must be read into memory. 1070 * * For large reservoir sizes better performance can be achieved using Algorithm R. 1071 * See the reservoirSamplingAlgorithmR documentation for details. 1072 * ) 1073 */ 1074 void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, Flag!"preserveInputOrder" preserveInputOrder, OutputRange) 1075 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1076 if (isOutputRange!(OutputRange, char)) 1077 { 1078 import std.algorithm : sort; 1079 import std.container.array; 1080 import std.container.binaryheap; 1081 import std.meta : AliasSeq; 1082 import std.random : Random = Mt19937, uniform01; 1083 import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, 1084 InputSourceRange, throwIfWindowsNewlineOnUnix; 1085 1086 static if (isWeighted) assert(cmdopt.hasWeightField); 1087 else assert(!cmdopt.hasWeightField); 1088 1089 assert(cmdopt.sampleSize > 0); 1090 1091 assert(!cmdopt.inputSources.empty); 1092 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 1093 1094 auto randomGenerator = Random(cmdopt.seed); 1095 1096 static struct Entry(Flag!"preserveInputOrder" preserveInputOrder) 1097 { 1098 double score; 1099 const(char)[] line; 1100 static if (preserveInputOrder) ulong lineNumber; 1101 } 1102 1103 /* Create the heap and backing data store. 1104 * 1105 * Note: An std.container.array is used as the backing store to avoid some issues in 1106 * the standard library (Phobos) binaryheap implementation. Specifically, when an 1107 * std.container.array is used as backing store, the heap can efficiently reversed by 1108 * removing the heap elements. This leaves the backing store in the reversed order. 1109 * However, the current binaryheap implementation does not support this for all 1110 * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094. 1111 */ 1112 1113 Array!(Entry!preserveInputOrder) dataStore; 1114 dataStore.reserve(cmdopt.sampleSize); 1115 auto reservoir = dataStore.heapify!("a.score > b.score")(0); // Min binaryheap 1116 1117 /* First header is read during command line argument processing. */ 1118 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 1119 { 1120 auto inputStream = cmdopt.inputSources.front; 1121 1122 if (cmdopt.printRandom) 1123 { 1124 outputStream.put(cmdopt.randomValueHeader); 1125 outputStream.put(cmdopt.delim); 1126 } 1127 outputStream.put(inputStream.header); 1128 outputStream.put("\n"); 1129 1130 /* Immediately flush the header so subsequent processes in a unix command 1131 * pipeline see it early. This helps provide timely error messages. 1132 */ 1133 static if (isFlushableOutputRange!OutputRange) outputStream.flush; 1134 } 1135 1136 /* Process each line. */ 1137 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 1138 static if (preserveInputOrder) ulong totalLineNum = 0; 1139 1140 foreach (inputStream; cmdopt.inputSources) 1141 { 1142 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1143 1144 foreach (ulong fileLineNum, line; 1145 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 1146 { 1147 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 1148 1149 static if (!isWeighted) 1150 { 1151 immutable double lineScore = uniform01(randomGenerator); 1152 } 1153 else 1154 { 1155 immutable double lineWeight = 1156 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum); 1157 immutable double lineScore = 1158 (lineWeight > 0.0) 1159 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 1160 : 0.0; 1161 } 1162 1163 static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum); 1164 else alias entryCTArgs = AliasSeq!(); 1165 1166 if (reservoir.length < cmdopt.sampleSize) 1167 { 1168 reservoir.insert(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs)); 1169 } 1170 else if (reservoir.front.score < lineScore) 1171 { 1172 reservoir.replaceFront(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs)); 1173 } 1174 1175 static if (preserveInputOrder) ++totalLineNum; 1176 } 1177 } 1178 1179 /* Done with input, all entries are in the reservoir. */ 1180 1181 /* The asserts here avoid issues with the current binaryheap implementation. They 1182 * detect use of backing stores having a length not synchronized to the reservoir. 1183 */ 1184 immutable ulong numLines = reservoir.length; 1185 assert(numLines == dataStore.length); 1186 1187 /* Update the backing store so it is in the desired output order. 1188 */ 1189 static if (preserveInputOrder) 1190 { 1191 dataStore[].sort!((a, b) => a.lineNumber < b.lineNumber); 1192 } 1193 else 1194 { 1195 /* Output in weighted selection order. The heap is in reverse order of assigned 1196 * weights. Reversing order is done by removing all elements from the heap. This 1197 * leaves the backing store in the correct order. 1198 */ 1199 while (!reservoir.empty) reservoir.removeFront; 1200 } 1201 1202 assert(numLines == dataStore.length); 1203 1204 foreach (entry; dataStore) 1205 { 1206 if (cmdopt.printRandom) 1207 { 1208 outputStream.formatRandomValue(entry.score); 1209 outputStream.put(cmdopt.delim); 1210 } 1211 outputStream.put(entry.line); 1212 outputStream.put("\n"); 1213 } 1214 } 1215 1216 /** Generate weighted random values for all input lines, preserving input order. 1217 * 1218 * This complements weighted reservoir sampling, but instead of using a reservoir it 1219 * simply iterates over the input lines generating the values. The weighted random 1220 * values are generated with the same formula used by reservoirSampling. 1221 */ 1222 void generateWeightedRandomValuesInorder(OutputRange) 1223 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1224 if (isOutputRange!(OutputRange, char)) 1225 { 1226 import std.random : Random = Mt19937, uniform01; 1227 import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, 1228 InputSourceRange, throwIfWindowsNewlineOnUnix; 1229 1230 assert(cmdopt.hasWeightField); 1231 1232 assert(!cmdopt.inputSources.empty); 1233 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 1234 1235 auto randomGenerator = Random(cmdopt.seed); 1236 1237 /* First header is read during command line argument processing. */ 1238 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 1239 { 1240 auto inputStream = cmdopt.inputSources.front; 1241 1242 outputStream.put(cmdopt.randomValueHeader); 1243 outputStream.put(cmdopt.delim); 1244 outputStream.put(inputStream.header); 1245 outputStream.put("\n"); 1246 1247 /* Immediately flush the header so subsequent processes in a unix command 1248 * pipeline see it early. This helps provide timely error messages. 1249 */ 1250 static if (isFlushableOutputRange!OutputRange) outputStream.flush; 1251 } 1252 1253 /* Process each line. */ 1254 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 1255 ulong numLinesWritten = 0; 1256 1257 foreach (inputStream; cmdopt.inputSources) 1258 { 1259 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1260 1261 foreach (ulong fileLineNum, line; 1262 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 1263 { 1264 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 1265 1266 immutable double lineWeight = 1267 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum); 1268 1269 immutable double lineScore = 1270 (lineWeight > 0.0) 1271 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 1272 : 0.0; 1273 1274 outputStream.formatRandomValue(lineScore); 1275 outputStream.put(cmdopt.delim); 1276 outputStream.put(line); 1277 outputStream.put("\n"); 1278 1279 if (cmdopt.sampleSize != 0) 1280 { 1281 ++numLinesWritten; 1282 if (numLinesWritten == cmdopt.sampleSize) return; 1283 } 1284 } 1285 } 1286 } 1287 1288 /** Reservoir sampling via Algorithm R 1289 * 1290 * This is an implementation of reservoir sampling using what is commonly known as 1291 * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of 1292 * Computer Programming, Volume 2: Seminumerical Algorithms". More information about 1293 * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with 1294 * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling" 1295 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R). 1296 * 1297 * Algorithm R is used for unweighted sampling without replacement. The heap-based 1298 * algorithm in reservoirSamplingViaHeap is used for weighted sampling. 1299 * 1300 * The classic algorithm stops after identifying the selected set of items. This 1301 * implementation goes one step further and randomizes the order of the selected 1302 * lines. This is consistent with shuffling (line order randomization), a primary 1303 * tsv-sample use-case. 1304 * 1305 * This algorithm is faster than reservoirSamplingViaHeap when the sample size 1306 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size. 1307 * Insertion in this algorithm is O(1). Similarly, generating the random order in the 1308 * heap is O(k * log k), while in this algorithm the final randomization step is O(k). 1309 * 1310 * This speed advantage may be offset a certain amount by using a more expensive random 1311 * value generator. reservoirSamplingViaHeap generates values between zero and one, 1312 * whereas reservoirSamplingAlgorithmR generates random integers over and ever growing 1313 * interval. The latter is expected to be more expensive. This is consistent with 1314 * performance tests indicating that reservoirSamplingViaHeap is faster when using 1315 * small-to-medium size reservoirs and large input streams. 1316 */ 1317 void reservoirSamplingAlgorithmR(Flag!"preserveInputOrder" preserveInputOrder, OutputRange) 1318 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1319 if (isOutputRange!(OutputRange, char)) 1320 { 1321 import std.meta : AliasSeq; 1322 import std.random : Random = Mt19937, randomShuffle, uniform; 1323 import std.algorithm : sort; 1324 import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, 1325 InputSourceRange, throwIfWindowsNewlineOnUnix; 1326 1327 assert(cmdopt.sampleSize > 0); 1328 assert(!cmdopt.hasWeightField); 1329 assert(!cmdopt.compatibilityMode); 1330 assert(!cmdopt.printRandom); 1331 assert(!cmdopt.genRandomInorder); 1332 1333 assert(!cmdopt.inputSources.empty); 1334 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 1335 1336 static struct Entry(Flag!"preserveInputOrder" preserveInputOrder) 1337 { 1338 const(char)[] line; 1339 static if (preserveInputOrder) ulong lineNumber; 1340 } 1341 1342 Entry!preserveInputOrder[] reservoir; 1343 auto reservoirAppender = appender(&reservoir); 1344 reservoirAppender.reserve(cmdopt.sampleSize); 1345 1346 auto randomGenerator = Random(cmdopt.seed); 1347 1348 /* First header is read during command line argument processing. */ 1349 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 1350 { 1351 auto inputStream = cmdopt.inputSources.front; 1352 1353 outputStream.put(inputStream.header); 1354 outputStream.put("\n"); 1355 1356 /* Immediately flush the header so subsequent processes in a unix command 1357 * pipeline see it early. This helps provide timely error messages. 1358 */ 1359 static if (isFlushableOutputRange!OutputRange) outputStream.flush; 1360 } 1361 1362 /* Process each line. */ 1363 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 1364 ulong totalLineNum = 0; 1365 1366 foreach (inputStream; cmdopt.inputSources) 1367 { 1368 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1369 1370 foreach (ulong fileLineNum, line; 1371 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 1372 { 1373 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 1374 1375 /* Add lines to the reservoir until the reservoir is filled. 1376 * After that lines are added with decreasing likelihood, based on 1377 * the total number of lines seen. If added to the reservoir, the 1378 * line replaces a randomly chosen existing line. 1379 */ 1380 static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum); 1381 else alias entryCTArgs = AliasSeq!(); 1382 1383 if (totalLineNum < cmdopt.sampleSize) 1384 { 1385 reservoirAppender ~= Entry!preserveInputOrder(line.idup, entryCTArgs); 1386 } 1387 else 1388 { 1389 immutable size_t i = uniform(0, totalLineNum, randomGenerator); 1390 if (i < reservoir.length) 1391 { 1392 reservoir[i] = Entry!preserveInputOrder(line.idup, entryCTArgs); 1393 } 1394 } 1395 1396 ++totalLineNum; 1397 } 1398 } 1399 1400 /* Done with input. The sample is in the reservoir. Update the order and print. */ 1401 1402 static if (preserveInputOrder) 1403 { 1404 reservoir.sort!((a, b) => a.lineNumber < b.lineNumber); 1405 } 1406 else 1407 { 1408 reservoir.randomShuffle(randomGenerator); 1409 } 1410 1411 foreach (ref entry; reservoir) 1412 { 1413 outputStream.put(entry.line); 1414 outputStream.put("\n"); 1415 } 1416 } 1417 1418 /** Shuffling command handler. Invokes the appropriate shuffle (line order 1419 * randomization) routine based on the command line arguments. 1420 * 1421 * Shuffling has similarities to random sampling, but the algorithms used are 1422 * different. Random sampling selects a subset, only the current subset selection 1423 * needs to be kept in memory. This is supported by reservoir sampling. By contrast, 1424 * shuffling needs to hold all input in memory, so it works better to read all lines 1425 * into memory at once and then shuffle. 1426 * 1427 * Two different algorithms are used. Array shuffling is used for unweighted shuffling. 1428 * Sorting plus random weight assignments is used for weighted shuffling and when 1429 * compatibility mode is being used. 1430 * 1431 * The algorithms used here are all limited by available memory. 1432 */ 1433 void shuffleCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1434 if (isOutputRange!(OutputRange, char)) 1435 { 1436 if (cmdopt.hasWeightField) 1437 { 1438 randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream); 1439 } 1440 else if (cmdopt.compatibilityMode) 1441 { 1442 randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream); 1443 } 1444 else 1445 { 1446 randomizeLinesViaShuffle(cmdopt, outputStream); 1447 } 1448 } 1449 1450 /** Shuffle all input lines by assigning random weights and sorting. 1451 * 1452 * randomizeLinesViaSort reads in all input lines and writes them out in random order. 1453 * The algorithm works by assigning a random value to each line and sorting. Both 1454 * weighted and unweighted shuffling are supported. 1455 * 1456 * Notes: 1457 * $(LIST 1458 * * For unweighted shuffling randomizeLinesViaShuffle is faster and should be used 1459 * unless compatibility mode is needed. 1460 * * This routine is significantly faster than heap-based reservoir sampling in the 1461 * case where the entire file is being read. 1462 * * Input data must be read entirely in memory. Disk oriented techniques are needed 1463 * when data sizes get too large for available memory. One option is to generate 1464 * random values for each line, e.g. --gen-random-inorder, and sort with a disk- 1465 * backed sort program like GNU sort. 1466 * ) 1467 */ 1468 void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange) 1469 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1470 if (isOutputRange!(OutputRange, char)) 1471 { 1472 import std.algorithm : map, sort; 1473 1474 static if (isWeighted) assert(cmdopt.hasWeightField); 1475 else assert(!cmdopt.hasWeightField); 1476 1477 assert(cmdopt.sampleSize == 0); 1478 1479 /* 1480 * Read all file data into memory. Then split the data into lines and assign a 1481 * random value to each line. readFileData also writes the first header line. 1482 */ 1483 const fileData = readFileData!(Yes.hasRandomValue)(cmdopt, outputStream); 1484 auto inputLines = fileData.identifyInputLines!(Yes.hasRandomValue, isWeighted)(cmdopt); 1485 1486 /* 1487 * Sort by the weight and output the lines. 1488 */ 1489 inputLines.sort!((a, b) => a.randomValue > b.randomValue); 1490 1491 foreach (lineEntry; inputLines) 1492 { 1493 if (cmdopt.printRandom) 1494 { 1495 outputStream.formatRandomValue(lineEntry.randomValue); 1496 outputStream.put(cmdopt.delim); 1497 } 1498 outputStream.put(lineEntry.data); 1499 outputStream.put("\n"); 1500 } 1501 } 1502 1503 /** Shuffle (randomize) all input lines using a shuffling algorithm. 1504 * 1505 * All lines in files and/or standard input are read in and written out in random 1506 * order. This routine uses array shuffling, which is faster than sorting. It is a 1507 * good alternative to randomizeLinesViaSort when doing unweighted shuffling (the 1508 * most common case). 1509 * 1510 * Input data size is limited by available memory. Disk oriented techniques are needed 1511 * when data sizes are larger. For example, generating random values line-by-line (ala 1512 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort. 1513 * 1514 * This routine does not support random value printing or compatibility-mode. 1515 */ 1516 void randomizeLinesViaShuffle(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1517 if (isOutputRange!(OutputRange, char)) 1518 { 1519 import std.algorithm : map; 1520 import std.random : Random = Mt19937, randomShuffle; 1521 1522 assert(cmdopt.sampleSize == 0); 1523 assert(!cmdopt.hasWeightField); 1524 assert(!cmdopt.printRandom); 1525 assert(!cmdopt.genRandomInorder); 1526 1527 /* 1528 * Read all file data into memory and split into lines. 1529 */ 1530 const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream); 1531 auto inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt); 1532 1533 /* 1534 * Randomly shuffle and print each line. 1535 * 1536 * Note: Also tried randomCover, but that was exceedingly slow. 1537 */ 1538 import std.random : randomShuffle; 1539 1540 auto randomGenerator = Random(cmdopt.seed); 1541 inputLines.randomShuffle(randomGenerator); 1542 1543 foreach (ref line; inputLines) 1544 { 1545 outputStream.put(line.data); 1546 outputStream.put("\n"); 1547 } 1548 } 1549 1550 /** Simple random sampling with replacement. 1551 * 1552 * All lines in files and/or standard input are read in. Then random lines are selected 1553 * one at a time and output. Lines can be selected multiple times. This process continues 1554 * until the desired number of samples (--n|num) has been output. Output continues 1555 * indefinitely if a sample size was not provided. 1556 */ 1557 void simpleRandomSamplingWithReplacement(OutputRange) 1558 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1559 if (isOutputRange!(OutputRange, char)) 1560 { 1561 import std.algorithm : map; 1562 import std.random : Random = Mt19937, uniform; 1563 1564 /* 1565 * Read all file data into memory and split the data into lines. 1566 */ 1567 const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream); 1568 const inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt); 1569 1570 if (inputLines.length > 0) 1571 { 1572 auto randomGenerator = Random(cmdopt.seed); 1573 1574 /* Repeat forever is sampleSize is zero, otherwise print sampleSize lines. */ 1575 size_t numLeft = (cmdopt.sampleSize == 0) ? 1 : cmdopt.sampleSize; 1576 while (numLeft != 0) 1577 { 1578 immutable size_t index = uniform(0, inputLines.length, randomGenerator); 1579 outputStream.put(inputLines[index].data); 1580 outputStream.put("\n"); 1581 if (cmdopt.sampleSize != 0) numLeft--; 1582 } 1583 } 1584 } 1585 1586 /** A container holding data read from a file or standard input. 1587 * 1588 * The InputBlock struct is used to represent a block of data read from a file or 1589 * standard input. An array of InputBlocks is returned by readFileData. Typically one 1590 * block per file. Multiple blocks are used for standard input and when the file size 1591 * cannot be determined. Individual lines are not allowed to span blocks. The blocks 1592 * allocated to an individual file are numbered starting with zero. 1593 * 1594 * See readFileData() for more information. 1595 */ 1596 static struct InputBlock 1597 { 1598 string filename; /// Original filename or path. "-" denotes standard input. 1599 size_t fileBlockNumber; /// Zero-based block number for the file. 1600 char[] data; /// The actual data. Newline terminated or last block for the file. 1601 } 1602 1603 /** Read data from one or more files. This routine is used by algorithms needing to 1604 * read all data into memory. 1605 * 1606 * readFileData reads in all data from a set of files. Data is returned as an array 1607 * of InputBlock structs. Normally one InputBlock per file, sized to match the size 1608 * of the file. Standard input is read in one or more blocks, as are files whose size 1609 * cannot be determined. Multiple blocks are used in these last two cases to avoid 1610 * expensive memory reallocations. This is not necessary when file size is known as 1611 * the necessary memory can be preallocated. 1612 * 1613 * Individual lines never span multiple blocks, and newlines are preserved. This 1614 * means that each block starts at the beginning of a line and ends with a newline 1615 * unless the end of a file has been reached. 1616 * 1617 * Each file gets its own block. Prior to using InputSourceRange this was so header 1618 * processing can be done. With InputSourceRange the header is read separately, so 1619 * this could be changed. 1620 */ 1621 InputBlock[] readFileData(HasRandomValue hasRandomValue, OutputRange) 1622 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1623 if (isOutputRange!(OutputRange, char)) 1624 { 1625 import std.algorithm : find, min; 1626 import std.range : retro; 1627 import tsv_utils.common.utils : InputSourceRange, isFlushableOutputRange, 1628 throwIfWindowsNewlineOnUnix; 1629 1630 static if(!hasRandomValue) assert(!cmdopt.printRandom); 1631 1632 assert(!cmdopt.inputSources.empty); 1633 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 1634 1635 /* First header is read during command line argument processing. */ 1636 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 1637 { 1638 auto inputStream = cmdopt.inputSources.front; 1639 1640 if (cmdopt.printRandom) 1641 { 1642 outputStream.put(cmdopt.randomValueHeader); 1643 outputStream.put(cmdopt.delim); 1644 } 1645 outputStream.put(inputStream.header); 1646 outputStream.put("\n"); 1647 1648 /* Immediately flush the header so subsequent processes in a unix command 1649 * pipeline see it early. This helps provide timely error messages. 1650 */ 1651 static if (isFlushableOutputRange!OutputRange) outputStream.flush; 1652 } 1653 1654 enum BlockSize = 1024L * 1024L * 1024L; // 1 GB. ('L' notation avoids overflow w/ 2GB+ sizes.) 1655 enum ReadSize = 1024L * 128L; 1656 enum NewlineSearchSize = 1024L * 16L; 1657 1658 InputBlock[] blocks; 1659 auto blocksAppender = appender(&blocks); 1660 blocksAppender.reserve(cmdopt.inputSources.length); // At least one block per file. 1661 1662 ubyte[] rawReadBuffer = new ubyte[ReadSize]; 1663 1664 foreach (inputStream; cmdopt.inputSources) 1665 { 1666 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1667 1668 /* If the file size can be determined then read it as a single block. 1669 * Otherwise read as multiple blocks. File.size() returns ulong.max 1670 * if file size cannot be determined, so we'll combine that check 1671 * with the standard input case. 1672 */ 1673 1674 immutable ulong filesize = inputStream.isStdin ? ulong.max : inputStream.file.size; 1675 auto ifile = inputStream.file; 1676 1677 if (filesize != ulong.max) 1678 { 1679 readFileDataAsOneBlock(inputStream.name, ifile, filesize, 1680 blocksAppender, rawReadBuffer); 1681 } 1682 else 1683 { 1684 readFileDataAsMultipleBlocks( 1685 inputStream.name, ifile, blocksAppender, rawReadBuffer, 1686 BlockSize, NewlineSearchSize); 1687 } 1688 } 1689 return blocks; 1690 } 1691 1692 /* readFileData() helper function. Read data from a File handle as a single block. The 1693 * new block is appended to an existing InputBlock[] array. 1694 * 1695 * readFileDataAsOneBlocks is part of the readFileData logic. It handles the case 1696 * where a file is being read as a single block. Normally initialBlockSize is passed 1697 * as the size of the file. 1698 * 1699 * This routine has been separated out to enable unit testing. At present it is not 1700 * intended as a general API. See readFileData for more info. 1701 */ 1702 private void readFileDataAsOneBlock( 1703 string filename, 1704 ref File ifile, 1705 const ulong initialBlockSize, 1706 ref RefAppender!(InputBlock[]) blocksAppender, 1707 ref ubyte[] rawReadBuffer) 1708 { 1709 blocksAppender.put(InputBlock(filename, 0)); 1710 auto dataAppender = appender(&(blocksAppender.data[$-1].data)); 1711 dataAppender.reserve(initialBlockSize); 1712 1713 foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer)) 1714 { 1715 dataAppender.put(cast(char[]) buffer); 1716 } 1717 } 1718 1719 /* readFileData() helper function. Read data from a File handle as one or more blocks. 1720 * Blocks are appended to an existing InputBlock[] array. 1721 * 1722 * readFileDataAsMultipleBlocks is part of the readFileData logic. It handles the case 1723 * where a file or standard input is being read as a series of blocks. This is the 1724 * standard approach for standard input, but also applies when the file size cannot be 1725 * determined. 1726 * 1727 * This routine has been separated out to enable unit testing. At present it is not 1728 * intended as a general API. See readFileData for more info. 1729 */ 1730 private void readFileDataAsMultipleBlocks( 1731 string filename, 1732 ref File ifile, 1733 ref RefAppender!(InputBlock[]) blocksAppender, 1734 ref ubyte[] rawReadBuffer, 1735 const size_t blockSize, 1736 const size_t newlineSearchSize) 1737 { 1738 import std.algorithm : find, min; 1739 import std.range : retro; 1740 1741 assert(ifile.isOpen); 1742 1743 /* Create a new block for the file and an Appender for writing data. 1744 */ 1745 blocksAppender.put(InputBlock(filename, 0)); 1746 auto dataAppender = appender(&(blocksAppender.data[$-1].data)); 1747 dataAppender.reserve(blockSize); 1748 size_t blockNumber = 0; 1749 1750 /* Read all the data and copy it to an InputBlock. */ 1751 foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer)) 1752 { 1753 assert(blockNumber == blocksAppender.data[$-1].fileBlockNumber); 1754 1755 immutable size_t remainingCapacity = dataAppender.capacity - dataAppender.data.length; 1756 1757 if (buffer.length <= remainingCapacity) 1758 { 1759 dataAppender.put(cast(char[]) buffer); 1760 } 1761 else 1762 { 1763 /* Look for the last newline in the input buffer that fits in remaining 1764 * capacity of the block. 1765 */ 1766 auto searchRegion = buffer[0 .. remainingCapacity]; 1767 auto appendRegion = searchRegion.retro.find('\n').source; 1768 1769 if (appendRegion.length > 0) 1770 { 1771 /* Copy the first part of the read buffer to the block. */ 1772 dataAppender.put(cast(char[]) appendRegion); 1773 1774 /* Create a new InputBlock and copy the remaining data to it. */ 1775 blockNumber++; 1776 blocksAppender.put(InputBlock(filename, blockNumber)); 1777 dataAppender = appender(&(blocksAppender.data[$-1].data)); 1778 dataAppender.reserve(blockSize); 1779 dataAppender.put(cast(char[]) buffer[appendRegion.length .. $]); 1780 1781 assert(blocksAppender.data.length >= 2); 1782 assert(blocksAppender.data[$-2].data[$-1] == '\n'); 1783 } 1784 else 1785 { 1786 /* Search backward in the current block for a newline. If found, it 1787 * becomes the last newline in the current block. Anything following 1788 * it is moved to the block. If a newline is not found, simply append 1789 * to the current block and let it grow. We'll only search backward 1790 * so far. 1791 */ 1792 immutable size_t currBlockLength = blocksAppender.data[$-1].data.length; 1793 immutable size_t searchLength = min(currBlockLength, newlineSearchSize); 1794 immutable size_t searchStart = currBlockLength - searchLength; 1795 auto blockSearchRegion = blocksAppender.data[$-1].data[searchStart .. $]; 1796 auto lastNewlineOffset = blockSearchRegion.retro.find('\n').source.length; 1797 1798 if (lastNewlineOffset != 0) 1799 { 1800 /* Create a new InputBlock. The previous InputBlock is then found 1801 * at blocksAppender.data[$-2]. It may be a physically different 1802 * struct (a copy) if the blocks array gets reallocated. 1803 */ 1804 blockNumber++; 1805 blocksAppender.put(InputBlock(filename, blockNumber)); 1806 dataAppender = appender(&(blocksAppender.data[$-1].data)); 1807 dataAppender.reserve(blockSize); 1808 1809 /* Copy data following the newline from the last block to the new 1810 * block. Then append the current read buffer. 1811 */ 1812 immutable size_t moveRegionStart = searchStart + lastNewlineOffset; 1813 dataAppender.put(blocksAppender.data[$-2].data[moveRegionStart .. $]); 1814 dataAppender.put(cast(char[]) buffer); 1815 1816 /* Now delete the moved region from the last block. */ 1817 blocksAppender.data[$-2].data.length = moveRegionStart; 1818 1819 assert(blocksAppender.data.length >= 2); 1820 assert(blocksAppender.data[$-2].data[$-1] == '\n'); 1821 } 1822 else 1823 { 1824 /* Give up. Allow the current block to grow. */ 1825 dataAppender.put(cast(char[]) buffer); 1826 } 1827 } 1828 } 1829 } 1830 } 1831 1832 /** HasRandomValue is a boolean flag used at compile time by identifyInputLines to 1833 * distinguish use cases needing random value assignments from those that don't. 1834 */ 1835 alias HasRandomValue = Flag!"hasRandomValue"; 1836 1837 /** An InputLine array is returned by identifyInputLines to represent each non-header line 1838 * line found in a FileData array. The 'data' element contains the line. A 'randomValue' 1839 * line is included if random values are being generated. 1840 */ 1841 static struct InputLine(HasRandomValue hasRandomValue) 1842 { 1843 const(char)[] data; 1844 static if (hasRandomValue) double randomValue; 1845 } 1846 1847 /** identifyInputLines is used by algorithms that read all files into memory prior to 1848 * processing. It does the initial processing of the file data. 1849 * 1850 * Two main tasks are performed. One is splitting all input data into lines. The second 1851 * is assigning a random value to the line, if random values are being generated. 1852 * 1853 * The key input is an InputBlock array. Normally one block for each file, but standard 1854 * input may have multiple blocks. 1855 * 1856 * The return value is an array of InputLine structs. The struct will have a 'randomValue' 1857 * member if random values are being assigned. 1858 */ 1859 InputLine!hasRandomValue[] identifyInputLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted) 1860 (const ref InputBlock[] inputBlocks, ref TsvSampleOptions cmdopt) 1861 { 1862 import std.algorithm : splitter; 1863 import std.array : appender; 1864 import std.random : Random = Mt19937, uniform01; 1865 import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; 1866 1867 static assert(hasRandomValue || !isWeighted); 1868 static if(!hasRandomValue) assert(!cmdopt.printRandom); 1869 1870 InputLine!hasRandomValue[] inputLines; 1871 1872 auto linesAppender = appender(&inputLines); 1873 static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed); 1874 1875 /* Note: fileLineNum is zero-based here. One-based in most other code in this file. */ 1876 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 1 : 0; 1877 size_t fileLineNum = fileBodyStartLine; 1878 1879 foreach (block; inputBlocks) 1880 { 1881 /* Drop the last newline to avoid adding an extra empty line. */ 1882 const data = (block.data.length > 0 && block.data[$-1] == '\n') ? 1883 block.data[0 .. $-1] : block.data; 1884 1885 if (block.fileBlockNumber == 0) fileLineNum = fileBodyStartLine; 1886 1887 foreach (ref line; data.splitter('\n')) 1888 { 1889 fileLineNum++; 1890 1891 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, block.filename, fileLineNum); 1892 1893 static if (!hasRandomValue) 1894 { 1895 linesAppender.put(InputLine!hasRandomValue(line)); 1896 } 1897 else 1898 { 1899 static if (!isWeighted) 1900 { 1901 immutable double randomValue = uniform01(randomGenerator); 1902 } 1903 else 1904 { 1905 immutable double lineWeight = 1906 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, 1907 block.filename, fileLineNum); 1908 immutable double randomValue = 1909 (lineWeight > 0.0) 1910 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 1911 : 0.0; 1912 } 1913 1914 linesAppender.put(InputLine!hasRandomValue(line, randomValue)); 1915 } 1916 } 1917 } 1918 1919 return inputLines; 1920 } 1921 1922 1923 /* Unit tests for ReadFileData. These tests focus on multiple InputBlock scenarios. 1924 * Other use paths are well tested by the tests at the end cases. 1925 */ 1926 unittest 1927 { 1928 import tsv_utils.common.unittest_utils; 1929 import std.algorithm : equal, find, joiner, splitter; 1930 import std.array : appender; 1931 import std.file : rmdirRecurse; 1932 import std.path : buildPath; 1933 import std.range : repeat; 1934 1935 auto rfdTestDir = makeUnittestTempDir("tsv_sample_readFileData"); 1936 scope(exit) rfdTestDir.rmdirRecurse; 1937 1938 char[] file1Data; 1939 char[] file2Data; 1940 char[] file3Data; 1941 1942 auto app1 = appender(&file1Data); 1943 auto app2 = appender(&file2Data); 1944 auto app3 = appender(&file3Data); 1945 1946 /* File 1: 1000 short lines. */ 1947 app1.put("\n".repeat(100).joiner); 1948 app1.put("x\n".repeat(100).joiner); 1949 app1.put("yz\n".repeat(100).joiner); 1950 app1.put("pqr\n".repeat(100).joiner); 1951 app1.put("a\nbc\ndef\n".repeat(100).joiner); 1952 app1.put('\n'.repeat(100)); 1953 app1.put("z\n".repeat(100).joiner); 1954 app1.put("xy\n".repeat(100).joiner); 1955 1956 /* File 2: 500 longer lines. */ 1957 app2.put( 1958 "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n" 1959 .repeat(100) 1960 .joiner); 1961 app2.put( 1962 "|abcdefghijklmnopqrstuv|\n|0123456789|\n|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ|\n|abcdefghijklmnopqrstuvwxyz|\n" 1963 .repeat(100) 1964 .joiner); 1965 app2.put( 1966 "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n" 1967 .repeat(100) 1968 .joiner); 1969 1970 /* File 3: 1000 mixed length lines. */ 1971 app3.put("\n\n|abcde|\n1\n12\n123\n|abcdefghijklmnop|\n|xyz|\n0123456789\nX\n".repeat(100).joiner); 1972 1973 string file1Path = buildPath(rfdTestDir, "file1.txt"); 1974 string file2Path = buildPath(rfdTestDir, "file2.txt"); 1975 string file3Path = buildPath(rfdTestDir, "file3.txt"); 1976 1977 try 1978 { 1979 auto ofile1 = File(file1Path, "w"); 1980 ofile1.write(file1Data); 1981 } 1982 catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file1Path, e.msg)); 1983 1984 try 1985 { 1986 auto ofile2 = File(file2Path, "w"); 1987 ofile2.write(file2Data); 1988 } 1989 catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file2Path, e.msg)); 1990 1991 try 1992 { 1993 auto ofile3 = File(file3Path, "w"); 1994 ofile3.write(file3Data); 1995 } 1996 catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file3Path, e.msg)); 1997 1998 auto allData = file1Data ~ file2Data ~ file3Data; 1999 auto expectedLines = allData.splitter('\n').array[0 .. $-1]; 2000 2001 auto file2DataNoHeader = (file2Data.find('\n'))[1 .. $]; 2002 auto file3DataNoHeader = (file3Data.find('\n'))[1 .. $]; 2003 auto allDataUsingHeader = file1Data ~ file2DataNoHeader ~ file3DataNoHeader; 2004 auto expectedLinesUsingHeader = allDataUsingHeader.splitter('\n').array[0 .. $-1]; 2005 2006 assert(expectedLines.length == expectedLinesUsingHeader.length + 2); 2007 2008 TsvSampleOptions cmdoptNoHeader; 2009 auto noHeaderCmdArgs = ["unittest", file1Path]; 2010 auto r1 = cmdoptNoHeader.processArgs(noHeaderCmdArgs); 2011 assert(r1[0], format("Invalid command lines arg: '%s'.", noHeaderCmdArgs)); 2012 2013 TsvSampleOptions cmdoptYesHeader; 2014 auto yesHeaderCmdArgs = ["unittest", "--header", file1Path]; 2015 auto r2 = cmdoptYesHeader.processArgs(yesHeaderCmdArgs); 2016 assert(r2[0], format("Invalid command lines arg: '%s'.", yesHeaderCmdArgs)); 2017 2018 auto outputStream = appender!(char[])(); 2019 2020 { 2021 /* Reading as single blocks. */ 2022 ubyte[] rawReadBuffer = new ubyte[256]; 2023 InputBlock[] blocks; 2024 auto blocksAppender = appender(&blocks); 2025 blocksAppender.reserve(3); 2026 foreach (f; [ file1Path, file2Path, file3Path ]) 2027 { 2028 auto ifile = f.File; 2029 ulong filesize = ifile.size; 2030 if (filesize == ulong.max) filesize = 1000; 2031 readFileDataAsOneBlock(f, ifile, filesize, blocksAppender, rawReadBuffer); 2032 } 2033 auto inputLines = 2034 identifyInputLines!(No.hasRandomValue, No.isWeighted)( 2035 blocks, cmdoptNoHeader); 2036 2037 assert(equal!((a, b) => a.data == b)(inputLines, expectedLines)); 2038 } 2039 2040 { 2041 /* Reading as multiple blocks. */ 2042 foreach (size_t searchSize; [ 0, 1, 2, 64 ]) 2043 { 2044 foreach (size_t blockSize; [ 1, 2, 16, 64, 256 ]) 2045 { 2046 foreach (size_t readSize; [ 1, 2, 8, 32 ]) 2047 { 2048 ubyte[] rawReadBuffer = new ubyte[readSize]; 2049 InputBlock[] blocks; 2050 auto blocksAppender = appender(&blocks); 2051 blocksAppender.reserve(3); 2052 foreach (f; [ file1Path, file2Path, file3Path ]) 2053 { 2054 auto ifile = f.File; 2055 readFileDataAsMultipleBlocks(f, ifile, blocksAppender, 2056 rawReadBuffer, blockSize, searchSize); 2057 } 2058 auto inputLines = 2059 identifyInputLines!(No.hasRandomValue, No.isWeighted)( 2060 blocks, cmdoptNoHeader); 2061 2062 assert(equal!((a, b) => a.data == b)(inputLines, expectedLines)); 2063 } 2064 } 2065 } 2066 } 2067 version(none) { 2068 { 2069 /* Reading as multiple blocks, with header processing. */ 2070 const size_t readSize = 32; 2071 const size_t blockSize = 48; 2072 const size_t searchSize = 16; 2073 2074 ubyte[] rawReadBuffer = new ubyte[readSize]; 2075 InputBlock[] blocks; 2076 auto blocksAppender = appender(&blocks); 2077 blocksAppender.reserve(3); 2078 foreach (f; [ file1Path, file2Path, file3Path ]) 2079 { 2080 auto ifile = f.File; 2081 readFileDataAsMultipleBlocks(f, ifile, blocksAppender, 2082 rawReadBuffer, blockSize, searchSize); 2083 } 2084 auto inputLines = 2085 identifyInputLines!(No.hasRandomValue, No.isWeighted)( 2086 blocks, cmdoptYesHeader); 2087 2088 assert(outputStream.data == expectedLinesUsingHeader[0] ~ '\n'); 2089 assert(equal!((a, b) => a.data == b)(inputLines, expectedLinesUsingHeader[1 .. $])); 2090 } 2091 } 2092 } 2093 2094 /** Write a floating point random value to an output stream. 2095 * 2096 * This routine is used for floating point random value printing. This routine writes 2097 * 17 significant digits, the range available in doubles. This routine prefers decimal 2098 * format, without exponents. It will generate somewhat large precision numbers, 2099 * currently up to 28 digits, before switching to exponents. 2100 * 2101 * The primary reason for this approach is to enable faster sorting on random values 2102 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster 2103 * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch). 2104 * The 'general numeric' handles exponential notation. The difference is 5-10x. 2105 * 2106 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12. 2107 * No examples less than 1e-09 were seen in hundred of millions of trials. Similar 2108 * results were seen with weighted sampling with integer weights. The same is not true 2109 * with floating point weights. These produce quite large exponents. However, even 2110 * for floating point weights this can be useful. For random weights [0,1] less than 5% 2111 * will be less than 1e-12 and use exponential notation. 2112 */ 2113 void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value) 2114 if (isOutputRange!(OutputRange, char)) 2115 { 2116 import std.format : formatValue, singleSpec; 2117 2118 immutable spec17f = singleSpec("%.17f"); 2119 immutable spec18f = singleSpec("%.18f"); 2120 immutable spec19f = singleSpec("%.19f"); 2121 immutable spec20f = singleSpec("%.20f"); 2122 immutable spec21f = singleSpec("%.21f"); 2123 immutable spec22f = singleSpec("%.22f"); 2124 immutable spec23f = singleSpec("%.23f"); 2125 immutable spec24f = singleSpec("%.24f"); 2126 immutable spec25f = singleSpec("%.25f"); 2127 immutable spec26f = singleSpec("%.26f"); 2128 immutable spec27f = singleSpec("%.27f"); 2129 immutable spec28f = singleSpec("%.28f"); 2130 2131 immutable spec17g = singleSpec("%.17g"); 2132 2133 immutable formatSpec = 2134 (value >= 1e-01) ? spec17f : 2135 (value >= 1e-02) ? spec18f : 2136 (value >= 1e-03) ? spec19f : 2137 (value >= 1e-04) ? spec20f : 2138 (value >= 1e-05) ? spec21f : 2139 (value >= 1e-06) ? spec22f : 2140 (value >= 1e-07) ? spec23f : 2141 (value >= 1e-08) ? spec24f : 2142 (value >= 1e-09) ? spec25f : 2143 (value >= 1e-10) ? spec26f : 2144 (value >= 1e-11) ? spec27f : 2145 (value >= 1e-12) ? spec28f : spec17g; 2146 2147 outputStream.formatValue(value, formatSpec); 2148 } 2149 2150 @safe unittest 2151 { 2152 void testFormatValue(double value, string expected) 2153 { 2154 import std.array : appender; 2155 2156 auto s = appender!string(); 2157 s.formatRandomValue(value); 2158 assert(s.data == expected, 2159 format("[testFormatValue] value: %g; expected: %s; actual: %s", value, expected, s.data)); 2160 } 2161 2162 testFormatValue(1.0, "1.00000000000000000"); 2163 testFormatValue(0.1, "0.10000000000000001"); 2164 testFormatValue(0.01, "0.010000000000000000"); 2165 testFormatValue(1e-03, "0.0010000000000000000"); 2166 testFormatValue(1e-04, "0.00010000000000000000"); 2167 testFormatValue(1e-05, "0.000010000000000000001"); 2168 testFormatValue(1e-06, "0.0000010000000000000000"); 2169 testFormatValue(1e-07, "0.00000010000000000000000"); 2170 testFormatValue(1e-08, "0.000000010000000000000000"); 2171 testFormatValue(1e-09, "0.0000000010000000000000001"); 2172 testFormatValue(1e-10, "0.00000000010000000000000000"); 2173 testFormatValue(1e-11, "0.000000000009999999999999999"); 2174 testFormatValue(1e-12, "0.0000000000010000000000000000"); 2175 testFormatValue(1e-13, "1e-13"); 2176 testFormatValue(1e-14, "1e-14"); 2177 testFormatValue(12345678901234567e-15, "12.34567890123456735"); 2178 testFormatValue(12345678901234567e-16, "1.23456789012345669"); 2179 testFormatValue(12345678901234567e-17, "0.12345678901234566"); 2180 testFormatValue(12345678901234567e-18, "0.012345678901234567"); 2181 testFormatValue(12345678901234567e-19, "0.0012345678901234567"); 2182 testFormatValue(12345678901234567e-20, "0.00012345678901234567"); 2183 testFormatValue(12345678901234567e-21, "0.000012345678901234568"); 2184 testFormatValue(12345678901234567e-22, "0.0000012345678901234567"); 2185 testFormatValue(12345678901234567e-23, "0.00000012345678901234566"); 2186 testFormatValue(12345678901234567e-24, "0.000000012345678901234567"); 2187 testFormatValue(12345678901234567e-25, "0.0000000012345678901234566"); 2188 testFormatValue(12345678901234567e-26, "0.00000000012345678901234568"); 2189 testFormatValue(12345678901234567e-27, "0.000000000012345678901234567"); 2190 testFormatValue(12345678901234567e-28, "0.0000000000012345678901234567"); 2191 testFormatValue(12345678901234567e-29, "1.2345678901234566e-13"); 2192 } 2193 2194 2195 /** Convenience function for extracting a single field from a line. See 2196 * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error 2197 * text tailored for this program. 2198 */ 2199 import std.traits : isSomeChar; 2200 T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, ulong lineNum) pure @safe 2201 if (isSomeChar!C) 2202 { 2203 import std.conv : ConvException, to; 2204 import tsv_utils.common.utils : getTsvFieldValue; 2205 2206 T val; 2207 try 2208 { 2209 val = getTsvFieldValue!T(line, fieldIndex, delim); 2210 } 2211 catch (ConvException exc) 2212 { 2213 throw new Exception( 2214 format("Could not process line: %s\n File: %s Line: %s%s", 2215 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum, 2216 (lineNum == 1) ? "\n Is this a header line? Use --H|header to skip." : "")); 2217 } 2218 catch (Exception exc) 2219 { 2220 /* Not enough fields on the line. */ 2221 throw new Exception( 2222 format("Could not process line: %s\n File: %s Line: %s", 2223 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum)); 2224 } 2225 2226 return val; 2227 } 2228 2229 @safe unittest 2230 { 2231 /* getFieldValue unit tests. getTsvFieldValue has it's own tests. 2232 * These tests make basic sanity checks on the getFieldValue wrapper. 2233 */ 2234 import std.exception; 2235 2236 assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123); 2237 assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4); 2238 assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 1)); 2239 assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 2)); 2240 assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 1)); 2241 assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 2)); 2242 } 2243 2244 /* Unit tests for the main program start here. 2245 * 2246 * Portability note: Many of the tests here rely on generating consistent random numbers 2247 * across different platforms when using the same random seed. So far this has succeeded 2248 * on several different platform, compiler, and library versions. However, it is certainly 2249 * possible this condition will not hold on other platforms. 2250 * 2251 * For tsv-sample, this portability implies generating the same results on different 2252 * platforms when using the same random seed. This is NOT part of tsv-sample guarantees, 2253 * but it is convenient for testing. If platforms are identified that do not generate 2254 * the same results these tests will need to be adjusted. 2255 */ 2256 version(unittest) 2257 { 2258 /* Unit test helper functions. */ 2259 2260 import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. 2261 import std.conv : to; 2262 2263 void testTsvSample(string[] cmdArgs, string[][] expected) 2264 { 2265 import std.array : appender; 2266 2267 assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty."); 2268 2269 auto formatAssertMessage(T...)(string msg, T formatArgs) 2270 { 2271 auto formatString = "[testTsvSample] %s: " ~ msg; 2272 return format(formatString, cmdArgs[0], formatArgs); 2273 } 2274 2275 TsvSampleOptions cmdopt; 2276 auto savedCmdArgs = cmdArgs.to!string; 2277 auto r = cmdopt.processArgs(cmdArgs); 2278 assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs)); 2279 auto output = appender!(char[])(); 2280 2281 tsvSample(cmdopt, output); // This invokes the main code line. 2282 2283 auto expectedOutput = expected.tsvDataToString; 2284 2285 assert(output.data == expectedOutput, 2286 formatAssertMessage( 2287 "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================", 2288 expectedOutput.to!string, output.data.to!string)); 2289 } 2290 } 2291 2292 unittest 2293 { 2294 import std.path : buildPath; 2295 import std.file : rmdirRecurse; 2296 2297 auto testDir = makeUnittestTempDir("tsv_sample"); 2298 scope(exit) testDir.rmdirRecurse; 2299 2300 /* Tabular data sets and expected results use the built-in static seed. 2301 * Tests are run by writing the data set to a file, then calling the main 2302 * routine to process. The function testTsvSample plays the role of the 2303 * main program. Rather than writing to expected output, the results are 2304 * matched against expected. The expected results were verified by hand 2305 * prior to inclusion in the test. 2306 * 2307 * The initial part of this section is simply setting up data files and 2308 * expected results. 2309 * 2310 * Expected results naming conventions: 2311 * - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected 2312 * - Sampling Type (required): Permute (Shuffle), Sample, Replace, Bernoulli, Distinct 2313 * - Compatibility: Compat, AlgoR, Skip, Swap, Inorder 2314 * - Weight Field: Wt<num>, e.g. Wt3 2315 * - Sample Size: Num<num>, eg. Num3 2316 * - Seed Value: V<num>, eg. V77 2317 * - Key Field: K<num>, e.g. K2 2318 * - Probability: P<num>, e.g P05 (5%) 2319 * - Printing Probabilities: Probs 2320 * - Printing Probs in order: ProbsInorder 2321 * - Printing Probs with custom header: RVCustom 2322 */ 2323 2324 /* Empty file. */ 2325 string[][] dataEmpty = []; 2326 string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv"); 2327 writeUnittestTsvFile(fpath_dataEmpty, dataEmpty); 2328 2329 /* 3x0, header only. */ 2330 string[][] data3x0 = [["field_a", "field_b", "field_c"]]; 2331 string fpath_data3x0 = buildPath(testDir, "data3x0.tsv"); 2332 writeUnittestTsvFile(fpath_data3x0, data3x0); 2333 2334 /* 3x1 */ 2335 string[][] data3x1 = 2336 [["field_a", "field_b", "field_c"], 2337 ["tan", "タン", "8.5"]]; 2338 2339 string fpath_data3x1 = buildPath(testDir, "data3x1.tsv"); 2340 string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv"); 2341 writeUnittestTsvFile(fpath_data3x1, data3x1); 2342 writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1 .. $]); 2343 2344 string[][] data3x1ExpectedReplaceNum3 = 2345 [["field_a", "field_b", "field_c"], 2346 ["tan", "タン", "8.5"], 2347 ["tan", "タン", "8.5"], 2348 ["tan", "タン", "8.5"]]; 2349 2350 /* 3x2 */ 2351 string[][] data3x2 = 2352 [["field_a", "field_b", "field_c"], 2353 ["brown", "褐色", "29.2"], 2354 ["gray", "グレー", "6.2"]]; 2355 2356 string fpath_data3x2 = buildPath(testDir, "data3x2.tsv"); 2357 string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv"); 2358 writeUnittestTsvFile(fpath_data3x2, data3x2); 2359 writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1 .. $]); 2360 2361 string[][] data3x2PermuteCompat = 2362 [["field_a", "field_b", "field_c"], 2363 ["gray", "グレー", "6.2"], 2364 ["brown", "褐色", "29.2"]]; 2365 2366 string[][] data3x2PermuteShuffle = 2367 [["field_a", "field_b", "field_c"], 2368 ["gray", "グレー", "6.2"], 2369 ["brown", "褐色", "29.2"]]; 2370 2371 /* 3x3 */ 2372 string[][] data3x3 = 2373 [["field_a", "field_b", "field_c"], 2374 ["orange", "オレンジ", "2.5"], 2375 ["pink", "ピンク", "1.1"], 2376 ["purple", "紫の", "42"]]; 2377 2378 string fpath_data3x3 = buildPath(testDir, "data3x3.tsv"); 2379 string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv"); 2380 writeUnittestTsvFile(fpath_data3x3, data3x3); 2381 writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1 .. $]); 2382 2383 string[][] data3x3ExpectedPermuteCompat = 2384 [["field_a", "field_b", "field_c"], 2385 ["purple", "紫の", "42"], 2386 ["pink", "ピンク", "1.1"], 2387 ["orange", "オレンジ", "2.5"]]; 2388 2389 string[][] data3x3ExpectedPermuteSwap = 2390 [["field_a", "field_b", "field_c"], 2391 ["purple", "紫の", "42"], 2392 ["orange", "オレンジ", "2.5"], 2393 ["pink", "ピンク", "1.1"]]; 2394 2395 /* 3x6 */ 2396 string[][] data3x6 = 2397 [["field_a", "field_b", "field_c"], 2398 ["red", "赤", "23.8"], 2399 ["green", "緑", "0.0072"], 2400 ["white", "白", "1.65"], 2401 ["yellow", "黄", "12"], 2402 ["blue", "青", "12"], 2403 ["black", "黒", "0.983"]]; 2404 string fpath_data3x6 = buildPath(testDir, "data3x6.tsv"); 2405 string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv"); 2406 writeUnittestTsvFile(fpath_data3x6, data3x6); 2407 writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1 .. $]); 2408 2409 // Randomization, all lines 2410 string[][] data3x6ExpectedPermuteCompat = 2411 [["field_a", "field_b", "field_c"], 2412 ["yellow", "黄", "12"], 2413 ["black", "黒", "0.983"], 2414 ["blue", "青", "12"], 2415 ["white", "白", "1.65"], 2416 ["green", "緑", "0.0072"], 2417 ["red", "赤", "23.8"]]; 2418 2419 string[][] data3x6ExpectedPermuteSwap = 2420 [["field_a", "field_b", "field_c"], 2421 ["black", "黒", "0.983"], 2422 ["green", "緑", "0.0072"], 2423 ["red", "赤", "23.8"], 2424 ["yellow", "黄", "12"], 2425 ["white", "白", "1.65"], 2426 ["blue", "青", "12"]]; 2427 2428 string[][] data3x6ExpectedPermuteCompatProbs = 2429 [["random_value", "field_a", "field_b", "field_c"], 2430 ["0.96055546286515892", "yellow", "黄", "12"], 2431 ["0.75710153928957880", "black", "黒", "0.983"], 2432 ["0.52525980887003243", "blue", "青", "12"], 2433 ["0.49287854949943721", "white", "白", "1.65"], 2434 ["0.15929344086907804", "green", "緑", "0.0072"], 2435 ["0.010968807619065046", "red", "赤", "23.8"]]; 2436 2437 /* Note: data3x6ExpectedSampleAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because 2438 * both are effectively the same algorithm given that --num is data length. Both read 2439 * in the full data in order then call randomShuffle. 2440 */ 2441 string[][] data3x6ExpectedSampleAlgoRNum6 = 2442 [["field_a", "field_b", "field_c"], 2443 ["black", "黒", "0.983"], 2444 ["green", "緑", "0.0072"], 2445 ["red", "赤", "23.8"], 2446 ["yellow", "黄", "12"], 2447 ["white", "白", "1.65"], 2448 ["blue", "青", "12"]]; 2449 2450 string[][] data3x6ExpectedSampleAlgoRNum5 = 2451 [["field_a", "field_b", "field_c"], 2452 ["red", "赤", "23.8"], 2453 ["black", "黒", "0.983"], 2454 ["white", "白", "1.65"], 2455 ["green", "緑", "0.0072"], 2456 ["yellow", "黄", "12"]]; 2457 2458 string[][] data3x6ExpectedSampleAlgoRNum4 = 2459 [["field_a", "field_b", "field_c"], 2460 ["blue", "青", "12"], 2461 ["green", "緑", "0.0072"], 2462 ["black", "黒", "0.983"], 2463 ["white", "白", "1.65"]]; 2464 2465 string[][] data3x6ExpectedSampleAlgoRNum3 = 2466 [["field_a", "field_b", "field_c"], 2467 ["red", "赤", "23.8"], 2468 ["black", "黒", "0.983"], 2469 ["green", "緑", "0.0072"]]; 2470 2471 string[][] data3x6ExpectedSampleAlgoRNum2 = 2472 [["field_a", "field_b", "field_c"], 2473 ["black", "黒", "0.983"], 2474 ["red", "赤", "23.8"]]; 2475 2476 string[][] data3x6ExpectedSampleAlgoRNum1 = 2477 [["field_a", "field_b", "field_c"], 2478 ["green", "緑", "0.0072"]]; 2479 2480 /* Inorder versions. */ 2481 string[][] data3x6ExpectedSampleAlgoRNum6Inorder = 2482 [["field_a", "field_b", "field_c"], 2483 ["red", "赤", "23.8"], 2484 ["green", "緑", "0.0072"], 2485 ["white", "白", "1.65"], 2486 ["yellow", "黄", "12"], 2487 ["blue", "青", "12"], 2488 ["black", "黒", "0.983"]]; 2489 2490 string[][] data3x6ExpectedSampleAlgoRNum5Inorder = 2491 [["field_a", "field_b", "field_c"], 2492 ["red", "赤", "23.8"], 2493 ["green", "緑", "0.0072"], 2494 ["white", "白", "1.65"], 2495 ["yellow", "黄", "12"], 2496 ["black", "黒", "0.983"]]; 2497 2498 string[][] data3x6ExpectedSampleAlgoRNum4Inorder = 2499 [["field_a", "field_b", "field_c"], 2500 ["green", "緑", "0.0072"], 2501 ["white", "白", "1.65"], 2502 ["blue", "青", "12"], 2503 ["black", "黒", "0.983"]]; 2504 2505 string[][] data3x6ExpectedSampleAlgoRNum3Inorder = 2506 [["field_a", "field_b", "field_c"], 2507 ["red", "赤", "23.8"], 2508 ["green", "緑", "0.0072"], 2509 ["black", "黒", "0.983"]]; 2510 2511 string[][] data3x6ExpectedSampleAlgoRNum2Inorder = 2512 [["field_a", "field_b", "field_c"], 2513 ["red", "赤", "23.8"], 2514 ["black", "黒", "0.983"]]; 2515 2516 string[][] data3x6ExpectedSampleAlgoRNum1Inorder = 2517 [["field_a", "field_b", "field_c"], 2518 ["green", "緑", "0.0072"]]; 2519 2520 /* Reservoir inorder */ 2521 string[][] data3x6ExpectedSampleCompatNum6Inorder = 2522 [["field_a", "field_b", "field_c"], 2523 ["red", "赤", "23.8"], 2524 ["green", "緑", "0.0072"], 2525 ["white", "白", "1.65"], 2526 ["yellow", "黄", "12"], 2527 ["blue", "青", "12"], 2528 ["black", "黒", "0.983"]]; 2529 2530 string[][] data3x6ExpectedSampleCompatNum5Inorder = 2531 [["field_a", "field_b", "field_c"], 2532 ["green", "緑", "0.0072"], 2533 ["white", "白", "1.65"], 2534 ["yellow", "黄", "12"], 2535 ["blue", "青", "12"], 2536 ["black", "黒", "0.983"]]; 2537 2538 string[][] data3x6ExpectedSampleCompatNum4Inorder = 2539 [["field_a", "field_b", "field_c"], 2540 ["white", "白", "1.65"], 2541 ["yellow", "黄", "12"], 2542 ["blue", "青", "12"], 2543 ["black", "黒", "0.983"]]; 2544 2545 string[][] data3x6ExpectedSampleCompatNum3Inorder = 2546 [["field_a", "field_b", "field_c"], 2547 ["yellow", "黄", "12"], 2548 ["blue", "青", "12"], 2549 ["black", "黒", "0.983"]]; 2550 2551 string[][] data3x6ExpectedSampleCompatNum2Inorder = 2552 [["field_a", "field_b", "field_c"], 2553 ["yellow", "黄", "12"], 2554 ["black", "黒", "0.983"]]; 2555 2556 string[][] data3x6ExpectedSampleCompatNum1Inorder = 2557 [["field_a", "field_b", "field_c"], 2558 ["yellow", "黄", "12"]]; 2559 2560 2561 /* Reservoir inorder with probabilities. */ 2562 string[][] data3x6ExpectedSampleCompatNum6ProbsInorder = 2563 [["random_value", "field_a", "field_b", "field_c"], 2564 ["0.010968807619065046", "red", "赤", "23.8"], 2565 ["0.15929344086907804", "green", "緑", "0.0072"], 2566 ["0.49287854949943721", "white", "白", "1.65"], 2567 ["0.96055546286515892", "yellow", "黄", "12"], 2568 ["0.52525980887003243", "blue", "青", "12"], 2569 ["0.75710153928957880", "black", "黒", "0.983"]]; 2570 2571 string[][] data3x6ExpectedSampleCompatNum5ProbsInorder = 2572 [["random_value", "field_a", "field_b", "field_c"], 2573 ["0.15929344086907804", "green", "緑", "0.0072"], 2574 ["0.49287854949943721", "white", "白", "1.65"], 2575 ["0.96055546286515892", "yellow", "黄", "12"], 2576 ["0.52525980887003243", "blue", "青", "12"], 2577 ["0.75710153928957880", "black", "黒", "0.983"]]; 2578 2579 string[][] data3x6ExpectedSampleCompatNum4ProbsInorder = 2580 [["random_value", "field_a", "field_b", "field_c"], 2581 ["0.49287854949943721", "white", "白", "1.65"], 2582 ["0.96055546286515892", "yellow", "黄", "12"], 2583 ["0.52525980887003243", "blue", "青", "12"], 2584 ["0.75710153928957880", "black", "黒", "0.983"]]; 2585 2586 string[][] data3x6ExpectedSampleCompatNum3ProbsInorder = 2587 [["random_value", "field_a", "field_b", "field_c"], 2588 ["0.96055546286515892", "yellow", "黄", "12"], 2589 ["0.52525980887003243", "blue", "青", "12"], 2590 ["0.75710153928957880", "black", "黒", "0.983"]]; 2591 2592 string[][] data3x6ExpectedSampleCompatNum2ProbsInorder = 2593 [["random_value", "field_a", "field_b", "field_c"], 2594 ["0.96055546286515892", "yellow", "黄", "12"], 2595 ["0.75710153928957880", "black", "黒", "0.983"]]; 2596 2597 string[][] data3x6ExpectedSampleCompatNum1ProbsInorder = 2598 [["random_value", "field_a", "field_b", "field_c"], 2599 ["0.96055546286515892", "yellow", "黄", "12"]]; 2600 2601 string[][] data3x6ExpectedWt3Num6Inorder = 2602 [["field_a", "field_b", "field_c"], 2603 ["red", "赤", "23.8"], 2604 ["green", "緑", "0.0072"], 2605 ["white", "白", "1.65"], 2606 ["yellow", "黄", "12"], 2607 ["blue", "青", "12"], 2608 ["black", "黒", "0.983"]]; 2609 2610 string[][] data3x6ExpectedWt3Num5Inorder = 2611 [["field_a", "field_b", "field_c"], 2612 ["green", "緑", "0.0072"], 2613 ["white", "白", "1.65"], 2614 ["yellow", "黄", "12"], 2615 ["blue", "青", "12"], 2616 ["black", "黒", "0.983"]]; 2617 2618 string[][] data3x6ExpectedWt3Num4Inorder = 2619 [["field_a", "field_b", "field_c"], 2620 ["white", "白", "1.65"], 2621 ["yellow", "黄", "12"], 2622 ["blue", "青", "12"], 2623 ["black", "黒", "0.983"]]; 2624 2625 string[][] data3x6ExpectedWt3Num3Inorder = 2626 [["field_a", "field_b", "field_c"], 2627 ["yellow", "黄", "12"], 2628 ["blue", "青", "12"], 2629 ["black", "黒", "0.983"]]; 2630 2631 string[][] data3x6ExpectedWt3Num2Inorder = 2632 [["field_a", "field_b", "field_c"], 2633 ["yellow", "黄", "12"], 2634 ["black", "黒", "0.983"]]; 2635 2636 string[][] data3x6ExpectedWt3Num1Inorder = 2637 [["field_a", "field_b", "field_c"], 2638 ["yellow", "黄", "12"]]; 2639 2640 2641 string[][] data3x6ExpectedBernoulliProbsP100 = 2642 [["random_value", "field_a", "field_b", "field_c"], 2643 ["0.010968807619065046", "red", "赤", "23.8"], 2644 ["0.15929344086907804", "green", "緑", "0.0072"], 2645 ["0.49287854949943721", "white", "白", "1.65"], 2646 ["0.96055546286515892", "yellow", "黄", "12"], 2647 ["0.52525980887003243", "blue", "青", "12"], 2648 ["0.75710153928957880", "black", "黒", "0.983"]]; 2649 2650 string[][] data3x6ExpectedBernoulliCompatProbsP60 = 2651 [["random_value", "field_a", "field_b", "field_c"], 2652 ["0.010968807619065046", "red", "赤", "23.8"], 2653 ["0.15929344086907804", "green", "緑", "0.0072"], 2654 ["0.49287854949943721", "white", "白", "1.65"], 2655 ["0.52525980887003243", "blue", "青", "12"]]; 2656 2657 string[][] data3x6ExpectedBernoulliSkipP40 = 2658 [["field_a", "field_b", "field_c"], 2659 ["red", "赤", "23.8"], 2660 ["green", "緑", "0.0072"], 2661 ["yellow", "黄", "12"]]; 2662 2663 string[][] data3x6ExpectedBernoulliCompatP60 = 2664 [["field_a", "field_b", "field_c"], 2665 ["red", "赤", "23.8"], 2666 ["green", "緑", "0.0072"], 2667 ["white", "白", "1.65"], 2668 ["blue", "青", "12"]]; 2669 2670 string[][] data3x6ExpectedDistinctK1K3P60 = 2671 [["field_a", "field_b", "field_c"], 2672 ["green", "緑", "0.0072"], 2673 ["white", "白", "1.65"], 2674 ["blue", "青", "12"]]; 2675 2676 string[][] data3x6ExpectedDistinctK1K3P60Probs = 2677 [["random_value", "field_a", "field_b", "field_c"], 2678 ["0", "green", "緑", "0.0072"], 2679 ["0", "white", "白", "1.65"], 2680 ["0", "blue", "青", "12"]]; 2681 2682 string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom = 2683 [["custom_random_value_header", "field_a", "field_b", "field_c"], 2684 ["0", "green", "緑", "0.0072"], 2685 ["0", "white", "白", "1.65"], 2686 ["0", "blue", "青", "12"]]; 2687 2688 string[][] data3x6ExpectedDistinctK2P2ProbsInorder = 2689 [["random_value", "field_a", "field_b", "field_c"], 2690 ["1", "red", "赤", "23.8"], 2691 ["0", "green", "緑", "0.0072"], 2692 ["0", "white", "白", "1.65"], 2693 ["1", "yellow", "黄", "12"], 2694 ["3", "blue", "青", "12"], 2695 ["2", "black", "黒", "0.983"]]; 2696 2697 string[][] data3x6ExpectedPermuteWt3Probs = 2698 [["random_value", "field_a", "field_b", "field_c"], 2699 ["0.99665198757645390", "yellow", "黄", "12"], 2700 ["0.94775884809836686", "blue", "青", "12"], 2701 ["0.82728234682286661", "red", "赤", "23.8"], 2702 ["0.75346697377181959", "black", "黒", "0.983"], 2703 ["0.65130103496422487", "white", "白", "1.65"], 2704 ["1.5636943712879866e-111", "green", "緑", "0.0072"]]; 2705 2706 string[][] data3x6ExpectedWt3ProbsInorder = 2707 [["random_value", "field_a", "field_b", "field_c"], 2708 ["0.82728234682286661", "red", "赤", "23.8"], 2709 ["1.5636943712879866e-111", "green", "緑", "0.0072"], 2710 ["0.65130103496422487", "white", "白", "1.65"], 2711 ["0.99665198757645390", "yellow", "黄", "12"], 2712 ["0.94775884809836686", "blue", "青", "12"], 2713 ["0.75346697377181959", "black", "黒", "0.983"]]; 2714 2715 string[][] data3x6ExpectedPermuteWt3 = 2716 [["field_a", "field_b", "field_c"], 2717 ["yellow", "黄", "12"], 2718 ["blue", "青", "12"], 2719 ["red", "赤", "23.8"], 2720 ["black", "黒", "0.983"], 2721 ["white", "白", "1.65"], 2722 ["green", "緑", "0.0072"]]; 2723 2724 2725 string[][] data3x6ExpectedReplaceNum10 = 2726 [["field_a", "field_b", "field_c"], 2727 ["black", "黒", "0.983"], 2728 ["green", "緑", "0.0072"], 2729 ["green", "緑", "0.0072"], 2730 ["red", "赤", "23.8"], 2731 ["yellow", "黄", "12"], 2732 ["red", "赤", "23.8"], 2733 ["white", "白", "1.65"], 2734 ["yellow", "黄", "12"], 2735 ["yellow", "黄", "12"], 2736 ["white", "白", "1.65"], 2737 ]; 2738 2739 string[][] data3x6ExpectedReplaceNum10V77 = 2740 [["field_a", "field_b", "field_c"], 2741 ["black", "黒", "0.983"], 2742 ["red", "赤", "23.8"], 2743 ["black", "黒", "0.983"], 2744 ["yellow", "黄", "12"], 2745 ["green", "緑", "0.0072"], 2746 ["green", "緑", "0.0072"], 2747 ["green", "緑", "0.0072"], 2748 ["yellow", "黄", "12"], 2749 ["blue", "青", "12"], 2750 ["white", "白", "1.65"], 2751 ]; 2752 2753 /* Using a different static seed. */ 2754 string[][] data3x6ExpectedPermuteCompatV41Probs = 2755 [["random_value", "field_a", "field_b", "field_c"], 2756 ["0.68057272653095424", "green", "緑", "0.0072"], 2757 ["0.67681624367833138", "blue", "青", "12"], 2758 ["0.32097338931635022", "yellow", "黄", "12"], 2759 ["0.25092361867427826", "red", "赤", "23.8"], 2760 ["0.15535934292711318", "black", "黒", "0.983"], 2761 ["0.046095821075141430", "white", "白", "1.65"]]; 2762 2763 string[][] data3x6ExpectedBernoulliCompatP60V41Probs = 2764 [["random_value", "field_a", "field_b", "field_c"], 2765 ["0.25092361867427826", "red", "赤", "23.8"], 2766 ["0.046095821075141430", "white", "白", "1.65"], 2767 ["0.32097338931635022", "yellow", "黄", "12"], 2768 ["0.15535934292711318", "black", "黒", "0.983"]]; 2769 2770 string[][] data3x6ExpectedPermuteWt3V41Probs = 2771 [["random_value", "field_a", "field_b", "field_c"], 2772 ["0.96799377498910666", "blue", "青", "12"], 2773 ["0.94356245792573568", "red", "赤", "23.8"], 2774 ["0.90964601024271996", "yellow", "黄", "12"], 2775 ["0.15491658409260103", "white", "白", "1.65"], 2776 ["0.15043620392537033", "black", "黒", "0.983"], 2777 ["6.1394674830701461e-24", "green", "緑", "0.0072"]]; 2778 2779 string[][] data3x6ExpectedWt3V41ProbsInorder = 2780 [["random_value", "field_a", "field_b", "field_c"], 2781 ["0.94356245792573568", "red", "赤", "23.8"], 2782 ["6.1394674830701461e-24", "green", "緑", "0.0072"], 2783 ["0.15491658409260103", "white", "白", "1.65"], 2784 ["0.90964601024271996", "yellow", "黄", "12"], 2785 ["0.96799377498910666", "blue", "青", "12"], 2786 ["0.15043620392537033", "black", "黒", "0.983"]]; 2787 2788 2789 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */ 2790 string[][] combo1ExpectedPermuteCompat = 2791 [["field_a", "field_b", "field_c"], 2792 ["yellow", "黄", "12"], 2793 ["tan", "タン", "8.5"], 2794 ["brown", "褐色", "29.2"], 2795 ["green", "緑", "0.0072"], 2796 ["red", "赤", "23.8"], 2797 ["purple", "紫の", "42"], 2798 ["black", "黒", "0.983"], 2799 ["white", "白", "1.65"], 2800 ["gray", "グレー", "6.2"], 2801 ["blue", "青", "12"], 2802 ["pink", "ピンク", "1.1"], 2803 ["orange", "オレンジ", "2.5"]]; 2804 2805 string[][] combo1ExpectedPermuteCompatProbs = 2806 [["random_value", "field_a", "field_b", "field_c"], 2807 ["0.97088520275428891", "yellow", "黄", "12"], 2808 ["0.96055546286515892", "tan", "タン", "8.5"], 2809 ["0.81756894313730299", "brown", "褐色", "29.2"], 2810 ["0.75710153928957880", "green", "緑", "0.0072"], 2811 ["0.52525980887003243", "red", "赤", "23.8"], 2812 ["0.49287854949943721", "purple", "紫の", "42"], 2813 ["0.47081507067196071", "black", "黒", "0.983"], 2814 ["0.38388182921335101", "white", "白", "1.65"], 2815 ["0.29215990612283349", "gray", "グレー", "6.2"], 2816 ["0.24033216014504433", "blue", "青", "12"], 2817 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2818 ["0.010968807619065046", "orange", "オレンジ", "2.5"]]; 2819 2820 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */ 2821 string[][] combo1ExpectedProbsInorder = 2822 [["random_value", "field_a", "field_b", "field_c"], 2823 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 2824 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2825 ["0.49287854949943721", "purple", "紫の", "42"], 2826 ["0.96055546286515892", "tan", "タン", "8.5"], 2827 ["0.52525980887003243", "red", "赤", "23.8"], 2828 ["0.75710153928957880", "green", "緑", "0.0072"], 2829 ["0.38388182921335101", "white", "白", "1.65"], 2830 ["0.97088520275428891", "yellow", "黄", "12"], 2831 ["0.24033216014504433", "blue", "青", "12"], 2832 ["0.47081507067196071", "black", "黒", "0.983"], 2833 ["0.81756894313730299", "brown", "褐色", "29.2"], 2834 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 2835 2836 string[][] combo1ExpectedBernoulliCompatP50Probs = 2837 [["random_value", "field_a", "field_b", "field_c"], 2838 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 2839 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2840 ["0.49287854949943721", "purple", "紫の", "42"], 2841 ["0.38388182921335101", "white", "白", "1.65"], 2842 ["0.24033216014504433", "blue", "青", "12"], 2843 ["0.47081507067196071", "black", "黒", "0.983"], 2844 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 2845 2846 string[][] combo1ExpectedBernoulliCompatP40 = 2847 [["field_a", "field_b", "field_c"], 2848 ["orange", "オレンジ", "2.5"], 2849 ["pink", "ピンク", "1.1"], 2850 ["white", "白", "1.65"], 2851 ["blue", "青", "12"], 2852 ["gray", "グレー", "6.2"]]; 2853 2854 string[][] combo1ExpectedDistinctK1P40 = 2855 [["field_a", "field_b", "field_c"], 2856 ["orange", "オレンジ", "2.5"], 2857 ["red", "赤", "23.8"], 2858 ["green", "緑", "0.0072"], 2859 ["blue", "青", "12"], 2860 ["black", "黒", "0.983"]]; 2861 2862 string[][] combo1ExpectedPermuteWt3Probs = 2863 [["random_value", "field_a", "field_b", "field_c"], 2864 ["0.99754077523718754", "yellow", "黄", "12"], 2865 ["0.99527665440088786", "tan", "タン", "8.5"], 2866 ["0.99312578945741659", "brown", "褐色", "29.2"], 2867 ["0.98329602553389361", "purple", "紫の", "42"], 2868 ["0.97330961938083660", "red", "赤", "23.8"], 2869 ["0.88797551521739648", "blue", "青", "12"], 2870 ["0.81999230489041786", "gray", "グレー", "6.2"], 2871 ["0.55975569204250941", "white", "白", "1.65"], 2872 ["0.46472135609205739", "black", "黒", "0.983"], 2873 ["0.18824582704191337", "pink", "ピンク", "1.1"], 2874 ["0.16446131853299920", "orange", "オレンジ", "2.5"], 2875 ["1.6438086931020549e-17", "green", "緑", "0.0072"]]; 2876 2877 string[][] combo1ExpectedPermuteWt3 = 2878 [["field_a", "field_b", "field_c"], 2879 ["yellow", "黄", "12"], 2880 ["tan", "タン", "8.5"], 2881 ["brown", "褐色", "29.2"], 2882 ["purple", "紫の", "42"], 2883 ["red", "赤", "23.8"], 2884 ["blue", "青", "12"], 2885 ["gray", "グレー", "6.2"], 2886 ["white", "白", "1.65"], 2887 ["black", "黒", "0.983"], 2888 ["pink", "ピンク", "1.1"], 2889 ["orange", "オレンジ", "2.5"], 2890 ["green", "緑", "0.0072"]]; 2891 2892 string[][] combo1ExpectedSampleAlgoRNum4 = 2893 [["field_a", "field_b", "field_c"], 2894 ["blue", "青", "12"], 2895 ["gray", "グレー", "6.2"], 2896 ["brown", "褐色", "29.2"], 2897 ["white", "白", "1.65"]]; 2898 2899 string[][] combo1ExpectedSampleAlgoRNum4Inorder = 2900 [["field_a", "field_b", "field_c"], 2901 ["white", "白", "1.65"], 2902 ["blue", "青", "12"], 2903 ["brown", "褐色", "29.2"], 2904 ["gray", "グレー", "6.2"]]; 2905 2906 string[][] combo1ExpectedReplaceNum10 = 2907 [["field_a", "field_b", "field_c"], 2908 ["gray", "グレー", "6.2"], 2909 ["yellow", "黄", "12"], 2910 ["yellow", "黄", "12"], 2911 ["white", "白", "1.65"], 2912 ["tan", "タン", "8.5"], 2913 ["white", "白", "1.65"], 2914 ["blue", "青", "12"], 2915 ["black", "黒", "0.983"], 2916 ["tan", "タン", "8.5"], 2917 ["purple", "紫の", "42"]]; 2918 2919 /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. */ 2920 string[][] data1x200 = 2921 [["field_a"], 2922 ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"], 2923 ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"], 2924 ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"], 2925 ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"], 2926 ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"], 2927 ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"], 2928 ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"], 2929 ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"], 2930 ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"], 2931 ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"], 2932 ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"], 2933 ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"], 2934 ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"], 2935 ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"], 2936 ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"], 2937 ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"], 2938 ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"], 2939 ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"], 2940 ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"], 2941 ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"], 2942 ]; 2943 2944 string fpath_data1x200 = buildPath(testDir, "data1x200.tsv"); 2945 string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv"); 2946 writeUnittestTsvFile(fpath_data1x200, data1x200); 2947 writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1 .. $]); 2948 2949 string[][] data1x200ExpectedBernoulliSkipV333P01 = 2950 [["field_a"], 2951 ["077"], 2952 ["119"]]; 2953 2954 string[][] data1x200ExpectedBernoulliSkipV333P02 = 2955 [["field_a"], 2956 ["038"], 2957 ["059"], 2958 ["124"], 2959 ["161"], 2960 ["162"], 2961 ["183"]]; 2962 2963 string[][] data1x200ExpectedBernoulliSkipV333P03 = 2964 [["field_a"], 2965 ["025"], 2966 ["039"], 2967 ["082"], 2968 ["107"], 2969 ["108"], 2970 ["122"], 2971 ["136"], 2972 ["166"], 2973 ["182"]]; 2974 2975 string[][] data1x200ExpectedBernoulliCompatV333P01 = 2976 [["field_a"], 2977 ["072"]]; 2978 2979 string[][] data1x200ExpectedBernoulliCompatV333P02 = 2980 [["field_a"], 2981 ["004"], 2982 ["072"]]; 2983 2984 string[][] data1x200ExpectedBernoulliCompatV333P03 = 2985 [["field_a"], 2986 ["004"], 2987 ["072"], 2988 ["181"]]; 2989 2990 /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files, 2991 * only expected results. The header is from 3x0, the results are offset 1-position 2992 * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line. 2993 */ 2994 string[][] combo2ExpectedBernoulliSkipV333P03 = 2995 [["field_a", "field_b", "field_c"], 2996 ["024"], 2997 ["038"], 2998 ["081"], 2999 ["106"], 3000 ["107"], 3001 ["121"], 3002 ["135"], 3003 ["165"], 3004 ["181"]]; 3005 3006 3007 /* 1x10 - Simple 1-column file. */ 3008 string[][] data1x10 = 3009 [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]]; 3010 string fpath_data1x10 = buildPath(testDir, "data1x10.tsv"); 3011 string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv"); 3012 writeUnittestTsvFile(fpath_data1x10, data1x10); 3013 writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1 .. $]); 3014 3015 string[][] data1x10ExpectedPermuteCompat = 3016 [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]]; 3017 3018 string[][] data1x10ExpectedPermuteWt1 = 3019 [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]]; 3020 3021 /* 2x10a - Uniform distribution [0,1]. */ 3022 string[][] data2x10a = 3023 [["line", "weight"], 3024 ["1", "0.26788837"], 3025 ["2", "0.06601298"], 3026 ["3", "0.38627527"], 3027 ["4", "0.47379424"], 3028 ["5", "0.02966641"], 3029 ["6", "0.05636231"], 3030 ["7", "0.70529242"], 3031 ["8", "0.91836862"], 3032 ["9", "0.99103720"], 3033 ["10", "0.31401740"]]; 3034 3035 string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv"); 3036 writeUnittestTsvFile(fpath_data2x10a, data2x10a); 3037 3038 string[][] data2x10aExpectedPermuteWt2Probs = 3039 [["random_value", "line", "weight"], 3040 ["0.96833865494543658", "8", "0.91836862"], 3041 ["0.91856842054413923", "4", "0.47379424"], 3042 ["0.25730832087795091", "7", "0.70529242"], 3043 ["0.23725317907018120", "9", "0.99103720"], 3044 ["0.16016096701872204", "3", "0.38627527"], 3045 ["0.090819662667243381", "10", "0.31401740"], 3046 ["0.0071764539244361172", "6", "0.05636231"], 3047 ["0.000000048318642951630057", "1", "0.26788837"], 3048 ["0.00000000037525692966535517", "5", "0.02966641"], 3049 ["8.2123247880095796e-13", "2", "0.06601298"]]; 3050 3051 /* 2x10b - Uniform distribution [0,1000]. */ 3052 string[][] data2x10b = 3053 [["line", "weight"], 3054 ["1", "761"], 3055 ["2", "432"], 3056 ["3", "103"], 3057 ["4", "448"], 3058 ["5", "750"], 3059 ["6", "711"], 3060 ["7", "867"], 3061 ["8", "841"], 3062 ["9", "963"], 3063 ["10", "784"]]; 3064 3065 string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv"); 3066 writeUnittestTsvFile(fpath_data2x10b, data2x10b); 3067 3068 string[][] data2x10bExpectedPermuteWt2Probs = 3069 [["random_value", "line", "weight"], 3070 ["0.99996486739067969", "8", "841"], 3071 ["0.99991017467137211", "4", "448"], 3072 ["0.99960871524873662", "6", "711"], 3073 ["0.99914188537143800", "5", "750"], 3074 ["0.99903963250274785", "10", "784"], 3075 ["0.99889631825931946", "7", "867"], 3076 ["0.99852058315191139", "9", "963"], 3077 ["0.99575669679158918", "2", "432"], 3078 ["0.99408758732050595", "1", "761"], 3079 ["0.99315467761212362", "3", "103"]]; 3080 3081 /* 2x10c - Logarithmic distribution in random order. */ 3082 string[][] data2x10c = 3083 [["line", "weight"], 3084 ["1", "31.85"], 3085 ["2", "17403.31"], 3086 ["3", "653.84"], 3087 ["4", "8.23"], 3088 ["5", "2671.04"], 3089 ["6", "26226.08"], 3090 ["7", "1.79"], 3091 ["8", "354.56"], 3092 ["9", "35213.81"], 3093 ["10", "679.29"]]; 3094 3095 string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv"); 3096 writeUnittestTsvFile(fpath_data2x10c, data2x10c); 3097 3098 string[][] data2x10cExpectedPermuteWt2Probs = 3099 [["random_value", "line", "weight"], 3100 ["0.99998939008709697", "6", "26226.08"], 3101 ["0.99995951291695517", "9", "35213.81"], 3102 ["0.99991666907613541", "8", "354.56"], 3103 ["0.99989445052186410", "2", "17403.31"], 3104 ["0.99975897602861630", "5", "2671.04"], 3105 ["0.99891852769877643", "3", "653.84"], 3106 ["0.99889167752782515", "10", "679.29"], 3107 ["0.99512207506850148", "4", "8.23"], 3108 ["0.86789371584259023", "1", "31.85"], 3109 ["0.58574438162915610", "7", "1.79"]]; 3110 3111 /* 2x10d. Logarithmic distribution in ascending order. */ 3112 string[][] data2x10d = 3113 [["line", "weight"], 3114 ["1", "1.79"], 3115 ["2", "8.23"], 3116 ["3", "31.85"], 3117 ["4", "354.56"], 3118 ["5", "653.84"], 3119 ["6", "679.29"], 3120 ["7", "2671.04"], 3121 ["8", "17403.31"], 3122 ["9", "26226.08"], 3123 ["10", "35213.81"]]; 3124 3125 string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv"); 3126 writeUnittestTsvFile(fpath_data2x10d, data2x10d); 3127 3128 string[][] data2x10dExpectedPermuteWt2Probs = 3129 [["random_value", "line", "weight"], 3130 ["0.99999830221846353", "8", "17403.31"], 3131 ["0.99997860834041397", "10", "35213.81"], 3132 ["0.99994563828986716", "9", "26226.08"], 3133 ["0.99988650363575737", "4", "354.56"], 3134 ["0.99964161939190088", "7", "2671.04"], 3135 ["0.99959045338948649", "6", "679.29"], 3136 ["0.99901574490639788", "5", "653.84"], 3137 ["0.97803163304747431", "3", "31.85"], 3138 ["0.79994791806910948", "2", "8.23"], 3139 ["0.080374261239949119", "1", "1.79"]]; 3140 3141 /* 2x10e. Logarithmic distribution in descending order. */ 3142 string[][] data2x10e = 3143 [["line", "weight"], 3144 ["1", "35213.81"], 3145 ["2", "26226.08"], 3146 ["3", "17403.31"], 3147 ["4", "2671.04"], 3148 ["5", "679.29"], 3149 ["6", "653.84"], 3150 ["7", "354.56"], 3151 ["8", "31.85"], 3152 ["9", "8.23"], 3153 ["10", "1.79"]]; 3154 string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv"); 3155 writeUnittestTsvFile(fpath_data2x10e, data2x10e); 3156 3157 string[][] data2x10eExpectedPermuteWt2Probs = 3158 [["random_value", "line", "weight"], 3159 ["0.99998493348975237", "4", "2671.04"], 3160 ["0.99995934807202624", "3", "17403.31"], 3161 ["0.99992995739727453", "2", "26226.08"], 3162 ["0.99987185679245649", "1", "35213.81"], 3163 ["0.99957451563173938", "6", "653.84"], 3164 ["0.99907273650209583", "8", "31.85"], 3165 ["0.99905260312968946", "5", "679.29"], 3166 ["0.99730333650516401", "7", "354.56"], 3167 ["0.84093902435227808", "9", "8.23"], 3168 ["0.65650015926290028", "10", "1.79"]]; 3169 3170 /* Data sets for distinct sampling. */ 3171 string[][] data5x25 = 3172 [["ID", "Shape", "Color", "Size", "Weight"], 3173 ["01", "circle", "red", "S", "10"], 3174 ["02", "circle", "black", "L", "20"], 3175 ["03", "square", "black", "L", "20"], 3176 ["04", "circle", "green", "L", "30"], 3177 ["05", "ellipse", "red", "S", "20"], 3178 ["06", "triangle", "red", "S", "10"], 3179 ["07", "triangle", "red", "L", "20"], 3180 ["08", "square", "black", "S", "10"], 3181 ["09", "circle", "black", "S", "20"], 3182 ["10", "square", "green", "L", "20"], 3183 ["11", "triangle", "red", "L", "20"], 3184 ["12", "circle", "green", "L", "30"], 3185 ["13", "ellipse", "red", "S", "20"], 3186 ["14", "circle", "green", "L", "30"], 3187 ["15", "ellipse", "red", "L", "30"], 3188 ["16", "square", "red", "S", "10"], 3189 ["17", "circle", "black", "L", "20"], 3190 ["18", "square", "red", "S", "20"], 3191 ["19", "square", "black", "L", "20"], 3192 ["20", "circle", "red", "S", "10"], 3193 ["21", "ellipse", "black", "L", "30"], 3194 ["22", "triangle", "red", "L", "30"], 3195 ["23", "circle", "green", "S", "20"], 3196 ["24", "square", "green", "L", "20"], 3197 ["25", "circle", "red", "S", "10"], 3198 ]; 3199 3200 string fpath_data5x25 = buildPath(testDir, "data5x25.tsv"); 3201 string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv"); 3202 writeUnittestTsvFile(fpath_data5x25, data5x25); 3203 writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1 .. $]); 3204 3205 string[][] data5x25ExpectedDistinctK2P40 = 3206 [["ID", "Shape", "Color", "Size", "Weight"], 3207 ["03", "square", "black", "L", "20"], 3208 ["05", "ellipse", "red", "S", "20"], 3209 ["08", "square", "black", "S", "10"], 3210 ["10", "square", "green", "L", "20"], 3211 ["13", "ellipse", "red", "S", "20"], 3212 ["15", "ellipse", "red", "L", "30"], 3213 ["16", "square", "red", "S", "10"], 3214 ["18", "square", "red", "S", "20"], 3215 ["19", "square", "black", "L", "20"], 3216 ["21", "ellipse", "black", "L", "30"], 3217 ["24", "square", "green", "L", "20"], 3218 ]; 3219 3220 string[][] data5x25ExpectedDistinctK2K4P20 = 3221 [["ID", "Shape", "Color", "Size", "Weight"], 3222 ["03", "square", "black", "L", "20"], 3223 ["07", "triangle", "red", "L", "20"], 3224 ["08", "square", "black", "S", "10"], 3225 ["10", "square", "green", "L", "20"], 3226 ["11", "triangle", "red", "L", "20"], 3227 ["16", "square", "red", "S", "10"], 3228 ["18", "square", "red", "S", "20"], 3229 ["19", "square", "black", "L", "20"], 3230 ["22", "triangle", "red", "L", "30"], 3231 ["24", "square", "green", "L", "20"], 3232 ]; 3233 3234 string[][] data5x25ExpectedDistinctK2K3K4P20 = 3235 [["ID", "Shape", "Color", "Size", "Weight"], 3236 ["04", "circle", "green", "L", "30"], 3237 ["07", "triangle", "red", "L", "20"], 3238 ["09", "circle", "black", "S", "20"], 3239 ["11", "triangle", "red", "L", "20"], 3240 ["12", "circle", "green", "L", "30"], 3241 ["14", "circle", "green", "L", "30"], 3242 ["16", "square", "red", "S", "10"], 3243 ["18", "square", "red", "S", "20"], 3244 ["22", "triangle", "red", "L", "30"], 3245 ]; 3246 3247 /* Fields 2 and 4 from data5x25. Distinct rows should be the same for equiv keys. */ 3248 string[][] data2x25 = 3249 [["Shape", "Size"], 3250 ["circle", "S"], 3251 ["circle", "L"], 3252 ["square", "L"], 3253 ["circle", "L"], 3254 ["ellipse", "S"], 3255 ["triangle", "S"], 3256 ["triangle", "L"], 3257 ["square", "S"], 3258 ["circle", "S"], 3259 ["square", "L"], 3260 ["triangle", "L"], 3261 ["circle", "L"], 3262 ["ellipse", "S"], 3263 ["circle", "L"], 3264 ["ellipse", "L"], 3265 ["square", "S"], 3266 ["circle", "L"], 3267 ["square", "S"], 3268 ["square", "L"], 3269 ["circle", "S"], 3270 ["ellipse", "L"], 3271 ["triangle", "L"], 3272 ["circle", "S"], 3273 ["square", "L"], 3274 ["circle", "S"], 3275 ]; 3276 3277 string fpath_data2x25 = buildPath(testDir, "data2x25.tsv"); 3278 string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv"); 3279 writeUnittestTsvFile(fpath_data2x25, data2x25); 3280 writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1 .. $]); 3281 3282 string[][] data2x25ExpectedDistinctK1K2P20 = 3283 [["Shape", "Size"], 3284 ["square", "L"], 3285 ["triangle", "L"], 3286 ["square", "S"], 3287 ["square", "L"], 3288 ["triangle", "L"], 3289 ["square", "S"], 3290 ["square", "S"], 3291 ["square", "L"], 3292 ["triangle", "L"], 3293 ["square", "L"], 3294 ]; 3295 3296 string[][] data1x25 = 3297 [["Shape-Size"], 3298 ["circle-S"], 3299 ["circle-L"], 3300 ["square-L"], 3301 ["circle-L"], 3302 ["ellipse-S"], 3303 ["triangle-S"], 3304 ["triangle-L"], 3305 ["square-S"], 3306 ["circle-S"], 3307 ["square-L"], 3308 ["triangle-L"], 3309 ["circle-L"], 3310 ["ellipse-S"], 3311 ["circle-L"], 3312 ["ellipse-L"], 3313 ["square-S"], 3314 ["circle-L"], 3315 ["square-S"], 3316 ["square-L"], 3317 ["circle-S"], 3318 ["ellipse-L"], 3319 ["triangle-L"], 3320 ["circle-S"], 3321 ["square-L"], 3322 ["circle-S"], 3323 ]; 3324 3325 string fpath_data1x25 = buildPath(testDir, "data1x25.tsv"); 3326 string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv"); 3327 writeUnittestTsvFile(fpath_data1x25, data1x25); 3328 writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1 .. $]); 3329 3330 string[][] data1x25ExpectedDistinctK1P20 = 3331 [["Shape-Size"], 3332 ["triangle-L"], 3333 ["square-S"], 3334 ["triangle-L"], 3335 ["ellipse-L"], 3336 ["square-S"], 3337 ["square-S"], 3338 ["ellipse-L"], 3339 ["triangle-L"], 3340 ]; 3341 3342 string[][] data1x25ExpectedDistinctK1P20Probs = 3343 [["random_value", "Shape-Size"], 3344 ["0", "triangle-L"], 3345 ["0", "square-S"], 3346 ["0", "triangle-L"], 3347 ["0", "ellipse-L"], 3348 ["0", "square-S"], 3349 ["0", "square-S"], 3350 ["0", "ellipse-L"], 3351 ["0", "triangle-L"], 3352 ]; 3353 3354 string[][] data1x25ExpectedDistinctK1P20ProbsInorder = 3355 [["random_value", "Shape-Size"], 3356 ["1", "circle-S"], 3357 ["4", "circle-L"], 3358 ["2", "square-L"], 3359 ["4", "circle-L"], 3360 ["2", "ellipse-S"], 3361 ["1", "triangle-S"], 3362 ["0", "triangle-L"], 3363 ["0", "square-S"], 3364 ["1", "circle-S"], 3365 ["2", "square-L"], 3366 ["0", "triangle-L"], 3367 ["4", "circle-L"], 3368 ["2", "ellipse-S"], 3369 ["4", "circle-L"], 3370 ["0", "ellipse-L"], 3371 ["0", "square-S"], 3372 ["4", "circle-L"], 3373 ["0", "square-S"], 3374 ["2", "square-L"], 3375 ["1", "circle-S"], 3376 ["0", "ellipse-L"], 3377 ["0", "triangle-L"], 3378 ["1", "circle-S"], 3379 ["2", "square-L"], 3380 ["1", "circle-S"], 3381 ]; 3382 3383 /* 3384 * Enough setup! Actually run some tests! 3385 */ 3386 3387 /* Shuffling tests. Headers, static seed, compatibility mode. With weights and without. */ 3388 testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty); 3389 testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0); 3390 testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1); 3391 testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat); 3392 testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat); 3393 testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat); 3394 testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 3395 testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 3396 testTsvSample(["test-a8b", "-H", "-s", "--weight-field", "field_c", fpath_data3x6], data3x6ExpectedPermuteWt3); 3397 testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3398 testTsvSample(["test-a9b", "-H", "-s", "--print-random", "-w", "field_c", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3399 testTsvSample(["test-a9c", "-H", "-s", "--print-random", "-w", "f*c", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3400 testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3401 testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3402 testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 3403 testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); 3404 testTsvSample(["test-a13b", "-H", "-v", "41", "-w", "field_c", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); 3405 3406 /* Shuffling, without compatibility mode, or with both compatibility and printing. */ 3407 testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty); 3408 testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0); 3409 testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1); 3410 testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle); 3411 testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap); 3412 testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap); 3413 testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 3414 testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3415 testTsvSample(["test-aa8b", "-H", "-s", "--print-random", "-w", "field_c", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3416 testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3417 3418 /* Reservoir sampling using Algorithm R. 3419 * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.) 3420 */ 3421 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 3422 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 3423 testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0); 3424 testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0); 3425 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1); 3426 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1); 3427 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6); 3428 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6); 3429 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5); 3430 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4); 3431 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3); 3432 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2); 3433 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1); 3434 3435 /* Inorder versions of Algorithm R tests. */ 3436 testTsvSample(["test-ai10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty); 3437 testTsvSample(["test-ai11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty); 3438 testTsvSample(["test-ai12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0); 3439 testTsvSample(["test-ai13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0); 3440 testTsvSample(["test-ai14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1); 3441 testTsvSample(["test-ai15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1); 3442 testTsvSample(["test-ai16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder); 3443 testTsvSample(["test-ai17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder); 3444 testTsvSample(["test-ai18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5Inorder); 3445 testTsvSample(["test-ai19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4Inorder); 3446 testTsvSample(["test-ai20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3Inorder); 3447 testTsvSample(["test-ai21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2Inorder); 3448 testTsvSample(["test-ai22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1Inorder); 3449 3450 /* Bernoulli sampling cases. */ 3451 testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty); 3452 testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0); 3453 testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1); 3454 testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6); 3455 testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6); 3456 testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3457 testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60); 3458 testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60); 3459 testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs); 3460 3461 /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. */ 3462 testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01); 3463 testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02); 3464 testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03); 3465 testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01); 3466 testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02); 3467 testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03); 3468 testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40); 3469 3470 /* Distinct sampling cases. */ 3471 testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty); 3472 testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0); 3473 testTsvSample(["test-a24b", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "field_a", fpath_data3x0], data3x0); 3474 testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1); 3475 testTsvSample(["test-a25b", "-H", "-s", "-p", "1.0", "-k", "field_b", fpath_data3x1], data3x1); 3476 testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6); 3477 testTsvSample(["test-a26b", "-H", "-s", "-p", "1.0", "-k", "field_b", fpath_data3x6], data3x6); 3478 testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 3479 testTsvSample(["test-a27b", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 3480 3481 /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling. 3482 * For weighted sampling, use the weighted cases, but with expected using the original ordering. 3483 */ 3484 testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3485 testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3486 testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 3487 data3x6ExpectedWt3ProbsInorder); 3488 testTsvSample(["test-a30b", "-H", "-s", "--gen-random-inorder", "--weight-field", "field_c", fpath_data3x6], 3489 data3x6ExpectedWt3ProbsInorder); 3490 testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 3491 data3x6ExpectedWt3V41ProbsInorder); 3492 testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6], 3493 data3x6ExpectedDistinctK1K3P60Probs); 3494 testTsvSample(["test-a32b", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", "--print-random", fpath_data3x6], 3495 data3x6ExpectedDistinctK1K3P60Probs); 3496 testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header", 3497 "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom); 3498 testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6], 3499 data3x6ExpectedDistinctK2P2ProbsInorder); 3500 3501 /* Simple random sampling with replacement. */ 3502 testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty); 3503 testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty); 3504 testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0); 3505 testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0); 3506 testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3); 3507 testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10); 3508 testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77); 3509 3510 /* Shuffling, compatibility mode, without headers. */ 3511 testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1 .. $]); 3512 testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1 .. $]); 3513 testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1 .. $]); 3514 testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1 .. $]); 3515 testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1 .. $]); 3516 testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]); 3517 testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]); 3518 testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]); 3519 testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. $]); 3520 3521 /* Shuffling, no headers, without compatibility mode, or with printing and compatibility mode. */ 3522 testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1 .. $]); 3523 testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1 .. $]); 3524 testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1 .. $]); 3525 testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1 .. $]); 3526 testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]); 3527 testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]); 3528 testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]); 3529 3530 /* Reservoir sampling using Algorithm R, no headers. */ 3531 testTsvSample(["test-ac10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 3532 testTsvSample(["test-ac11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 3533 testTsvSample(["test-ac14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1 .. $]); 3534 testTsvSample(["test-ac15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1 .. $]); 3535 testTsvSample(["test-ac16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]); 3536 testTsvSample(["test-ac17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]); 3537 testTsvSample(["test-ac18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5[1 .. $]); 3538 testTsvSample(["test-ac19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4[1 .. $]); 3539 testTsvSample(["test-ac20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3[1 .. $]); 3540 testTsvSample(["test-ac21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2[1 .. $]); 3541 testTsvSample(["test-ac22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]); 3542 3543 /* Reservoir sampling using Algorithm R, no headers, inorder output. */ 3544 testTsvSample(["test-aj10", "--prefer-algorithm-r", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty); 3545 testTsvSample(["test-aj11", "--prefer-algorithm-r", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty); 3546 testTsvSample(["test-aj14", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3547 testTsvSample(["test-aj15", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3548 testTsvSample(["test-aj16", "--prefer-algorithm-r", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]); 3549 testTsvSample(["test-aj17", "--prefer-algorithm-r", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]); 3550 testTsvSample(["test-aj18", "--prefer-algorithm-r", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5Inorder[1 .. $]); 3551 testTsvSample(["test-aj19", "--prefer-algorithm-r", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4Inorder[1 .. $]); 3552 testTsvSample(["test-aj20", "--prefer-algorithm-r", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3Inorder[1 .. $]); 3553 testTsvSample(["test-aj21", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2Inorder[1 .. $]); 3554 testTsvSample(["test-aj22", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]); 3555 3556 /* Bernoulli sampling cases. */ 3557 testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1 .. $]); 3558 testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]); 3559 testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]); 3560 testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]); 3561 testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1 .. $]); 3562 testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. $]); 3563 3564 /* Bernoulli sampling with probabilities in skip sampling range. */ 3565 testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1 .. $]); 3566 testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1 .. $]); 3567 testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1 .. $]); 3568 testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1 .. $]); 3569 testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1 .. $]); 3570 testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1 .. $]); 3571 testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. $]); 3572 3573 /* Distinct sampling cases. */ 3574 testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1 .. $]); 3575 testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); 3576 testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); 3577 testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); 3578 3579 /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */ 3580 testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]); 3581 testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1 .. $]); 3582 testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader], 3583 data3x6ExpectedDistinctK1K3P60Probs[1 .. $]); 3584 testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader], 3585 data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]); 3586 3587 /* Simple random sampling with replacement. */ 3588 testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty); 3589 testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty); 3590 testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1 .. $]); 3591 testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1 .. $]); 3592 testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]); 3593 3594 /* Multi-file tests. */ 3595 testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode", 3596 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3597 combo1ExpectedPermuteCompat); 3598 testTsvSample(["test-c2", "--header", "--static-seed", "--print-random", 3599 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3600 combo1ExpectedPermuteCompatProbs); 3601 testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3", 3602 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3603 combo1ExpectedPermuteWt3Probs); 3604 testTsvSample(["test-c3b", "--header", "--static-seed", "--print-random", "--weight-field", "field_c", 3605 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3606 combo1ExpectedPermuteWt3Probs); 3607 testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode", 3608 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3609 combo1ExpectedPermuteWt3); 3610 testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", 3611 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3612 combo1ExpectedSampleAlgoRNum4); 3613 testTsvSample(["test-c5b", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder", 3614 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3615 combo1ExpectedSampleAlgoRNum4Inorder); 3616 3617 /* Multi-file, no headers. */ 3618 testTsvSample(["test-c6", "--static-seed", "--compatibility-mode", 3619 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3620 fpath_data3x6_noheader, fpath_data3x2_noheader], 3621 combo1ExpectedPermuteCompat[1 .. $]); 3622 testTsvSample(["test-c7", "--static-seed", "--print-random", 3623 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3624 fpath_data3x6_noheader, fpath_data3x2_noheader], 3625 combo1ExpectedPermuteCompatProbs[1 .. $]); 3626 testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3", 3627 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3628 fpath_data3x6_noheader, fpath_data3x2_noheader], 3629 combo1ExpectedPermuteWt3Probs[1 .. $]); 3630 testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode", 3631 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3632 fpath_data3x6_noheader, fpath_data3x2_noheader], 3633 combo1ExpectedPermuteWt3[1 .. $]); 3634 testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4", 3635 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3636 fpath_data3x6_noheader, fpath_data3x2_noheader], 3637 combo1ExpectedSampleAlgoRNum4[1 .. $]); 3638 testTsvSample(["test-c10b", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder", 3639 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3640 fpath_data3x6_noheader, fpath_data3x2_noheader], 3641 combo1ExpectedSampleAlgoRNum4Inorder[1 .. $]); 3642 3643 /* Bernoulli sampling cases. */ 3644 testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5", 3645 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3646 combo1ExpectedBernoulliCompatP50Probs); 3647 testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4", 3648 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3649 combo1ExpectedBernoulliCompatP40); 3650 testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5", 3651 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3652 fpath_data3x6_noheader, fpath_data3x2_noheader], 3653 combo1ExpectedBernoulliCompatP50Probs[1 .. $]); 3654 testTsvSample(["test-c14", "--static-seed", "--prob", ".4", 3655 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3656 fpath_data3x6_noheader, fpath_data3x2_noheader], 3657 combo1ExpectedBernoulliCompatP40[1 .. $]); 3658 3659 /* Bernoulli sampling with probabilities in skip sampling range. */ 3660 testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03", 3661 fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10], 3662 combo2ExpectedBernoulliSkipV333P03); 3663 testTsvSample(["test-cc1", "-v", "333", "-p", "0.03", 3664 fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader], 3665 combo2ExpectedBernoulliSkipV333P03[1 .. $]); 3666 3667 /* Distinct sampling cases. */ 3668 testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4", 3669 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3670 combo1ExpectedDistinctK1P40); 3671 testTsvSample(["test-c13b", "--header", "--static-seed", "--key-fields", "field_a", "--prob", ".4", 3672 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3673 combo1ExpectedDistinctK1P40); 3674 testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4", 3675 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3676 fpath_data3x6_noheader, fpath_data3x2_noheader], 3677 combo1ExpectedDistinctK1P40[1 .. $]); 3678 3679 /* Generating random weights. */ 3680 testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder", 3681 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3682 combo1ExpectedProbsInorder); 3683 testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder", 3684 fpath_data3x3_noheader, fpath_data3x1_noheader, 3685 fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader], 3686 combo1ExpectedProbsInorder[1 .. $]); 3687 3688 /* Simple random sampling with replacement. */ 3689 testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10", 3690 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3691 combo1ExpectedReplaceNum10); 3692 3693 testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10", 3694 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3695 fpath_data3x6_noheader, fpath_data3x2_noheader], 3696 combo1ExpectedReplaceNum10[1 .. $]); 3697 3698 /* Single column file. */ 3699 testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 3700 testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 3701 3702 /* Distributions. */ 3703 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs); 3704 testTsvSample(["test-e1b", "-H", "-s", "-w", "weight", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs); 3705 testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs); 3706 testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs); 3707 testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs); 3708 testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs); 3709 3710 /* Tests of subset sample (--n|num) field. Random sampling, Bernoulli sampling, distinct sampling. 3711 * 3712 * Note: The way these tests are done ensures that subset length does not affect 3713 * output order. 3714 */ 3715 import std.algorithm : min; 3716 for (size_t n = data3x6.length + 2; n >= 1; n--) 3717 { 3718 /* reservoirSamplingViaHeap. 3719 */ 3720 size_t expectedLength = min(data3x6.length, n + 1); 3721 testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string, 3722 "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 3723 3724 testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string, 3725 "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 3726 3727 testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string, 3728 "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]); 3729 3730 testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string, 3731 "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]); 3732 3733 testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string, 3734 "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]); 3735 3736 testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string, 3737 fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]); 3738 3739 testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string, 3740 "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]); 3741 3742 testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string, 3743 "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]); 3744 3745 testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string, 3746 "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]); 3747 3748 /* Bernoulli sampling. 3749 */ 3750 import std.algorithm : min; 3751 size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length); 3752 3753 testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 3754 "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]); 3755 3756 testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 3757 "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]); 3758 3759 testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 3760 "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]); 3761 3762 testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 3763 fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]); 3764 3765 /* Distinct Sampling. 3766 */ 3767 size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length); 3768 3769 testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 3770 "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]); 3771 3772 testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 3773 fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]); 3774 3775 testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 3776 "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]); 3777 3778 testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 3779 fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]); 3780 } 3781 3782 /* Similar tests with the 1x10 data set. */ 3783 for (size_t n = data1x10.length + 2; n >= 1; n--) 3784 { 3785 size_t expectedLength = min(data1x10.length, n + 1); 3786 testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string, 3787 "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]); 3788 3789 testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string, 3790 "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]); 3791 3792 testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string, 3793 fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]); 3794 3795 testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string, 3796 "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]); 3797 } 3798 3799 /* Simple random sampling with replacement: ensure sample size doesn't change order. */ 3800 for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--) 3801 { 3802 testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6], 3803 data3x6ExpectedReplaceNum10[0 .. n + 1]); 3804 3805 testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader], 3806 data3x6ExpectedReplaceNum10[1 .. n + 1]); 3807 } 3808 3809 /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */ 3810 for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--) 3811 { 3812 size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1); 3813 3814 testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 3815 "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]); 3816 3817 testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 3818 fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]); 3819 } 3820 3821 /* Inorder sampling tests using reservoir sampling via heap (compatibility mode). */ 3822 testTsvSample(["test-ar10", "--compatibility-mode", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty); 3823 testTsvSample(["test-ar11", "--compatibility-mode", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty); 3824 testTsvSample(["test-ar12", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0); 3825 testTsvSample(["test-ar13", "--compatibility-mode", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0); 3826 testTsvSample(["test-ar14", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1); 3827 testTsvSample(["test-ar15", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1); 3828 testTsvSample(["test-ar16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder); 3829 testTsvSample(["test-ar17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder); 3830 testTsvSample(["test-ar18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum5Inorder); 3831 testTsvSample(["test-ar19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum4Inorder); 3832 testTsvSample(["test-ar20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum3Inorder); 3833 testTsvSample(["test-ar21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum2Inorder); 3834 testTsvSample(["test-ar22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum1Inorder); 3835 3836 testTsvSample(["test-as10", "--compatibility-mode", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty); 3837 testTsvSample(["test-as11", "--compatibility-mode", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty); 3838 testTsvSample(["test-as14", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3839 testTsvSample(["test-as15", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3840 testTsvSample(["test-as16", "--compatibility-mode", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]); 3841 testTsvSample(["test-as17", "--compatibility-mode", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]); 3842 testTsvSample(["test-as18", "--compatibility-mode", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5Inorder[1 .. $]); 3843 testTsvSample(["test-as19", "--compatibility-mode", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4Inorder[1 .. $]); 3844 testTsvSample(["test-as20", "--compatibility-mode", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3Inorder[1 .. $]); 3845 testTsvSample(["test-as21", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2Inorder[1 .. $]); 3846 testTsvSample(["test-as22", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1Inorder[1 .. $]); 3847 3848 /* Inorder sampling tests with random number printing. --compatibility-mode not needed. */ 3849 testTsvSample(["test-at16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder); 3850 testTsvSample(["test-at17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder); 3851 testTsvSample(["test-at18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum5ProbsInorder); 3852 testTsvSample(["test-at19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder); 3853 testTsvSample(["test-at19", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder); 3854 testTsvSample(["test-at20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder); 3855 testTsvSample(["test-at20", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder); 3856 testTsvSample(["test-at21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum2ProbsInorder); 3857 testTsvSample(["test-at22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum1ProbsInorder); 3858 3859 testTsvSample(["test-au16", "--compatibility-mode", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]); 3860 testTsvSample(["test-au17", "--compatibility-mode", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]); 3861 testTsvSample(["test-au18", "--compatibility-mode", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5ProbsInorder[1 .. $]); 3862 testTsvSample(["test-au19", "--compatibility-mode", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]); 3863 testTsvSample(["test-au19", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]); 3864 testTsvSample(["test-au20", "--compatibility-mode", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3ProbsInorder[1 .. $]); 3865 testTsvSample(["test-au21", "--compatibility-mode", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2ProbsInorder[1 .. $]); 3866 testTsvSample(["test-au22", "--compatibility-mode", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1ProbsInorder[1 .. $]); 3867 3868 /* Inorder weighted sampling tests. */ 3869 testTsvSample(["test-ax16", "-H", "-s", "-n", "7", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder); 3870 testTsvSample(["test-ax17", "-H", "-s", "-n", "6", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder); 3871 testTsvSample(["test-ax18", "-H", "-s", "-n", "5", "-i", fpath_data3x6], data3x6ExpectedWt3Num5Inorder); 3872 testTsvSample(["test-ax19", "-H", "-s", "-n", "4", "-i", fpath_data3x6], data3x6ExpectedWt3Num4Inorder); 3873 testTsvSample(["test-ax20", "-H", "-s", "-n", "3", "-i", fpath_data3x6], data3x6ExpectedWt3Num3Inorder); 3874 testTsvSample(["test-ax21", "-H", "-s", "-n", "2", "-i", fpath_data3x6], data3x6ExpectedWt3Num2Inorder); 3875 testTsvSample(["test-ax22", "-H", "-s", "-n", "1", "-i", fpath_data3x6], data3x6ExpectedWt3Num1Inorder); 3876 3877 testTsvSample(["test-ay16", "-s", "-n", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]); 3878 testTsvSample(["test-ay17", "-s", "-n", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]); 3879 testTsvSample(["test-ay18", "-s", "-n", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num5Inorder[1 .. $]); 3880 testTsvSample(["test-ay19", "-s", "-n", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num4Inorder[1 .. $]); 3881 testTsvSample(["test-ay20", "-s", "-n", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num3Inorder[1 .. $]); 3882 testTsvSample(["test-ay21", "-s", "-n", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num2Inorder[1 .. $]); 3883 testTsvSample(["test-ay22", "-s", "-n", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num1Inorder[1 .. $]); 3884 3885 /* 3886 * Distinct sampling tests. 3887 */ 3888 testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25], 3889 data5x25ExpectedDistinctK2P40); 3890 3891 testTsvSample(["test-j1b", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "Shape", fpath_data5x25], 3892 data5x25ExpectedDistinctK2P40); 3893 3894 testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25], 3895 data5x25ExpectedDistinctK2K4P20); 3896 3897 testTsvSample(["test-j2b", "-H", "-s", "-p", "0.20", "-k", "Shape,Size", fpath_data5x25], 3898 data5x25ExpectedDistinctK2K4P20); 3899 3900 testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25], 3901 data5x25ExpectedDistinctK2K3K4P20); 3902 3903 testTsvSample(["test-j3b", "-H", "-s", "-p", "0.20", "-k", "Shape-Size", fpath_data5x25], 3904 data5x25ExpectedDistinctK2K3K4P20); 3905 3906 testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader], 3907 data5x25ExpectedDistinctK2P40[1 .. $]); 3908 3909 testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader], 3910 data5x25ExpectedDistinctK2K4P20[1 .. $]); 3911 3912 testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader], 3913 data5x25ExpectedDistinctK2K3K4P20[1 .. $]); 3914 3915 3916 /* These distinct tests check that the whole line as '-k 0' and specifying all fields 3917 * in order have the same result. Also that field numbers don't matter, as '-k 1,2' 3918 * in data2x25 are the same keys as '-k 2,4' in data5x25. 3919 */ 3920 testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25], 3921 data2x25ExpectedDistinctK1K2P20); 3922 3923 testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25], 3924 data2x25ExpectedDistinctK1K2P20); 3925 3926 testTsvSample(["test-j8b", "-H", "-s", "-p", "0.20", "-k", "*", fpath_data2x25], 3927 data2x25ExpectedDistinctK1K2P20); 3928 3929 testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader], 3930 data2x25ExpectedDistinctK1K2P20[1 .. $]); 3931 3932 testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader], 3933 data2x25ExpectedDistinctK1K2P20[1 .. $]); 3934 3935 /* Similar to the last set, but for a 1-column file. Also with random value printing. */ 3936 testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25], 3937 data1x25ExpectedDistinctK1P20); 3938 3939 testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25], 3940 data1x25ExpectedDistinctK1P20); 3941 3942 testTsvSample(["test-j12b", "-H", "-s", "-p", "0.20", "-k", "*", fpath_data1x25], 3943 data1x25ExpectedDistinctK1P20); 3944 3945 testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader], 3946 data1x25ExpectedDistinctK1P20[1 .. $]); 3947 3948 testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader], 3949 data1x25ExpectedDistinctK1P20[1 .. $]); 3950 3951 testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25], 3952 data1x25ExpectedDistinctK1P20Probs); 3953 3954 testTsvSample(["test-j15b", "-H", "-s", "-p", "0.20", "-k", `Shape\-Size`, "--print-random", fpath_data1x25], 3955 data1x25ExpectedDistinctK1P20Probs); 3956 3957 testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25], 3958 data1x25ExpectedDistinctK1P20Probs); 3959 3960 testTsvSample(["test-j16b", "-H", "-s", "-p", "0.20", "-k", "*", "--print-random", fpath_data1x25], 3961 data1x25ExpectedDistinctK1P20Probs); 3962 3963 testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader], 3964 data1x25ExpectedDistinctK1P20Probs[1 .. $]); 3965 3966 testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader], 3967 data1x25ExpectedDistinctK1P20Probs[1 .. $]); 3968 3969 testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25], 3970 data1x25ExpectedDistinctK1P20ProbsInorder); 3971 3972 testTsvSample(["test-j19b", "-H", "-s", "-p", "0.20", "-k", `Shape\-Size`, "--gen-random-inorder", fpath_data1x25], 3973 data1x25ExpectedDistinctK1P20ProbsInorder); 3974 3975 testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25], 3976 data1x25ExpectedDistinctK1P20ProbsInorder); 3977 3978 testTsvSample(["test-j20b", "-H", "-s", "-p", "0.20", "-k", "*", "--gen-random-inorder", fpath_data1x25], 3979 data1x25ExpectedDistinctK1P20ProbsInorder); 3980 3981 testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader], 3982 data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]); 3983 3984 testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader], 3985 data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]); 3986 3987 }