/**
Command line tool for shuffling or sampling lines from input streams. Several methods
are available, including weighted and unweighted shuffling, simple and weighted random
sampling, sampling with replacement, Bernoulli sampling, and distinct sampling.

Copyright (c) 2017-2021, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_sample;

import std.array : appender, Appender, RefAppender;
import std.exception : enforce;
import std.format : format;
import std.range;
import std.stdio;
import std.typecons : tuple, Flag, Yes, No;

/* Skip GC cleanup at program exit; the OS reclaims memory faster on process end. */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program.
     *
     * Invokes command line argument processing and calls tsvSample to do the real
     * work. Errors occurring during processing are caught and reported to the user.
     */
    int main(string[] cmdArgs)
    {
        import tsv_utils.common.utils : BufferedOutputRange, LineBuffered;

        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        /* Line buffering is only requested for streaming modes; translate to the flag type. */
        immutable LineBuffered linebuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;

        try tsvSample(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout, linebuffered));
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

immutable helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.

Fields are specified using field number or field name. Field names require
that the input file has a header line.

Use '--help-verbose' for detailed information.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order.
Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.

Fields: Fields are specified by field number or name. Field names require
the input file to have a header line. Use '--help-fields' for details.

Sample size: The '--n|num' option controls the sample size for all
sampling methods. In the case of simple and weighted random sampling it
also limits the amount of memory required.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, there is no memory accumulation. These algorithms
can run on arbitrary size inputs. Sampling with replacement reads all
lines into memory and is limited by available memory. Shuffling also reads
all lines into memory and is similarly limited. Random sampling uses
reservoir sampling, and only needs to hold the sample size (--n|num) in
memory. The input data can be of any length.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";

/** Container for command line options and derived data.
 *
 * TsvSampleOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, TsvSampleOptions is used as a container
 * holding the specific processing options used by the different sampling routines.
 */
struct TsvSampleOptions
{
    import tsv_utils.common.utils : InputSourceRange;

    string programName;                        /// Program name
    InputSourceRange inputSources;             /// Input files
    bool hasHeader = false;                    /// --H|header
    ulong sampleSize = 0;                      /// --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
    size_t[] keyFields;                        /// Derived: --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    /// Derived: --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           /// --r|replace
    bool preserveInputOrder = false;           /// --i|inorder
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    bool printRandom = false;                  /// --print-random
    bool genRandomInorder = false;             /// --gen-random-inorder
    string randomValueHeader = "random_value"; /// --random-value-header
    bool compatibilityMode = false;            /// --compatibility-mode
    char delim = '\t';                         /// --d|delimiter
    bool lineBuffered = false;                 /// --line-buffered
    bool preferSkipSampling = false;           /// --prefer-skip-sampling
    bool preferAlgorithmR = false;             /// --prefer-algorithm-r
    bool hasWeightField = false;               /// Derived.
    bool useBernoulliSampling = false;         /// Derived.
234 bool useDistinctSampling = false; /// Derived. 235 bool distinctKeyIsFullLine = false; /// Derived. True if '--k|key-fields 0' is specfied. 236 bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value 237 uint seed = 0; /// Derived from --static-seed, --seed-value 238 239 /** Process tsv-sample command line arguments. 240 * 241 * Defines the command line options, performs validation, and derives additional 242 * state. std.getopt.getopt is called to do the main option processing followed 243 * additional validation and derivation. 244 * 245 * Help text is printed to standard output if help was requested. Error text is 246 * written to stderr if invalid input is encountered. 247 * 248 * A tuple is returned. First value is true if command line arguments were 249 * successfully processed and execution should continue, or false if an error 250 * occurred or the user asked for help. If false, the second value is the 251 * appropriate exit code (0 or 1). 252 * 253 * Returning true (execution continues) means args have been validated and derived 254 * values calculated. Field indices will have been converted to zero-based. 255 */ 256 auto processArgs(ref string[] cmdArgs) 257 { 258 import std.algorithm : all, canFind, each; 259 import std.conv : to; 260 import std.getopt; 261 import std.math : isNaN; 262 import std.path : baseName, stripExtension; 263 import std.typecons : Yes, No; 264 import tsv_utils.common.utils : inputSourceRange, ReadHeader, throwIfWindowsNewline; 265 import tsv_utils.common.fieldlist; 266 267 bool helpVerbose = false; // --help-verbose 268 bool helpFields = false; // --help-fields 269 bool versionWanted = false; // --V|version 270 string keyFieldsArg; // --k|key-fields 271 string weightFieldArg; // --w|weight-field 272 273 string keyFieldsOptionString = "k|key-fields"; 274 string weightFieldOptionString = "w|weight-field"; 275 276 programName = (cmdArgs.length > 0) ? 
cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 277 278 try 279 { 280 arraySep = ","; // Use comma to separate values in command line options 281 auto r = getopt( 282 cmdArgs, 283 "help-verbose", " Print more detailed help.", &helpVerbose, 284 "help-fields", " Print help on specifying fields.", &helpFields, 285 286 std.getopt.config.caseSensitive, 287 "H|header", " Treat the first line of each file as a header.", &hasHeader, 288 std.getopt.config.caseInsensitive, 289 290 "n|num", "NUM Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize, 291 "p|prob", "NUM Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability, 292 293 keyFieldsOptionString, 294 "<field-list> Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.", 295 &keyFieldsArg, 296 297 weightFieldOptionString, 298 "NUM Field containing weights. All lines get equal weight if not provided.", 299 &weightFieldArg, 300 301 "r|replace", " Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement, 302 "i|inorder", " Output random samples in original input order. Requires use of --n|num.", &preserveInputOrder, 303 "s|static-seed", " Use the same random seed every run.", &staticSeed, 304 305 std.getopt.config.caseSensitive, 306 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. 
Zero is a no-op.", &seedValueOptionArg, 307 std.getopt.config.caseInsensitive, 308 309 "print-random", " Include the assigned random value (prepended) when writing output lines.", &printRandom, 310 "gen-random-inorder", " Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder, 311 "random-value-header", " Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader, 312 "compatibility-mode", " Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode, 313 314 "d|delimiter", "CHR Field delimiter.", &delim, 315 "line-buffered", " Immediately output every sampled line. Applies to Bernoulli and distinct sampling. Ignored in modes where all input data must be read before generating output.", &lineBuffered, 316 317 std.getopt.config.caseSensitive, 318 "V|version", " Print version information and exit.", &versionWanted, 319 std.getopt.config.caseInsensitive, 320 321 "prefer-skip-sampling", " (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.", 322 &preferSkipSampling, 323 324 "prefer-algorithm-r", " (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.", 325 &preferAlgorithmR, 326 ); 327 328 if (r.helpWanted) 329 { 330 defaultGetoptPrinter(helpText, r.options); 331 return tuple(false, 0); 332 } 333 else if (helpVerbose) 334 { 335 defaultGetoptPrinter(helpTextVerbose, r.options); 336 return tuple(false, 0); 337 } 338 else if (helpFields) 339 { 340 writeln(fieldListHelpText); 341 return tuple(false, 0); 342 } 343 else if (versionWanted) 344 { 345 import tsv_utils.common.tsvutils_version; 346 writeln(tsvutilsVersionNotice("tsv-sample")); 347 return tuple(false, 0); 348 } 349 350 /* Input files. Remaining command line args are files. */ 351 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. 
$] : ["-"]; 352 cmdArgs.length = 1; 353 354 /* Validation and derivations - Do as much validation prior to header line 355 * processing as possible (avoids waiting on stdin). 356 * 357 * Note: keyFields and weightField depend on header line processing, but 358 * keyFieldsArg and weightFieldArg can be used to detect whether the 359 * command line argument was specified. 360 */ 361 362 /* Set hasWeightField here so it can be used in other validation checks. 363 * Field validity checked after reading file header. 364 */ 365 hasWeightField = !weightFieldArg.empty; 366 367 /* Sampling with replacement checks (--r|replace). */ 368 if (srsWithReplacement) 369 { 370 enforce(!hasWeightField, 371 "Sampling with replacement (--r|replace) does not support weights (--w|weight-field)."); 372 373 enforce(inclusionProbability.isNaN, 374 "Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob)."); 375 376 enforce(keyFieldsArg.empty, 377 "Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields)."); 378 379 enforce(!printRandom && !genRandomInorder, 380 "Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder)."); 381 382 enforce(!preserveInputOrder, 383 "Sampling with replacement (--r|replace) does not support input order preservation (--i|inorder option)."); 384 } 385 386 /* Distinct sampling checks (--k|key-fields --p|prob). */ 387 enforce(keyFieldsArg.empty | !inclusionProbability.isNaN, 388 "--p|prob is required when using --k|key-fields."); 389 390 /* Inclusion probability (--p|prob) is used for both Bernoulli sampling 391 * and distinct sampling. 392 */ 393 if (!inclusionProbability.isNaN) 394 { 395 enforce(inclusionProbability > 0.0 && inclusionProbability <= 1.0, 396 format("Invalid --p|prob option: %g. 
Must satisfy 0.0 < prob <= 1.0.", inclusionProbability)); 397 398 if (!keyFieldsArg.empty) useDistinctSampling = true; 399 else useBernoulliSampling = true; 400 401 enforce(!hasWeightField, "--w|weight-field and --p|prob cannot be used together."); 402 403 enforce(!genRandomInorder || useDistinctSampling, 404 "--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used." ~ 405 "\nUse --gen-random-inorder alone to print probabilities for all lines." ~ 406 "\nUse --p|prob and --print-random to print probabilities for lines satisfying the probability threshold."); 407 } 408 else if (genRandomInorder && !hasWeightField) 409 { 410 useBernoulliSampling = true; 411 } 412 413 /* randomValueHeader (--random-value-header) validity. Note that 414 randomValueHeader is initialized to a valid, non-empty string. 415 */ 416 enforce(!randomValueHeader.empty && !randomValueHeader.canFind('\n') && 417 !randomValueHeader.canFind(delim), 418 "--randomValueHeader must be at least one character and not contain field delimiters or newlines."); 419 420 /* Check for incompatible use of (--i|inorder) and shuffling of the full 421 * data set. Sampling with replacement is also incompatible, this is 422 * detected earlier. Shuffling is the default operation, so it identified 423 * by eliminating the other modes of operation. 424 */ 425 enforce(!preserveInputOrder || 426 sampleSize != 0 || 427 useBernoulliSampling || 428 useDistinctSampling, 429 "Preserving input order (--i|inorder) is not compatible with full data set shuffling. Switch to random sampling with a sample size (--n|num) to use --i|inorder."); 430 431 /* Compatibility mode checks: 432 * - Random value printing implies compatibility-mode, otherwise user's 433 * selection is used. 434 * - Distinct sampling doesn't support compatibility-mode. The routines 435 * don't care, but users might expect larger probabilities to be a 436 * superset of smaller probabilities. 
This would be confusing, so 437 * flag it as an error. 438 */ 439 enforce(!(compatibilityMode && useDistinctSampling), 440 "Distinct sampling (--k|key-fields --p|prob) does not support --compatibility-mode."); 441 442 if (printRandom || genRandomInorder) compatibilityMode = true; 443 444 /* Ignore --line-buffered if not using Bernoulli or distinct sampling. */ 445 if (!useBernoulliSampling && !useDistinctSampling) lineBuffered = false; 446 447 /* Seed. */ 448 import std.random : unpredictableSeed; 449 450 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 451 452 if (usingUnpredictableSeed) seed = unpredictableSeed; 453 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 454 else if (staticSeed) seed = 2438424139; 455 else assert(0, "Internal error, invalid seed option states."); 456 457 string[] headerFields; 458 459 /* fieldListArgProcessing encapsulates the field list processing. It is 460 * called prior to reading the header line if headers are not being used, 461 * and after if headers are being used. 
462 */ 463 void fieldListArgProcessing() 464 { 465 if (!weightFieldArg.empty) 466 { 467 auto fieldIndices = 468 weightFieldArg 469 .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero) 470 (hasHeader, headerFields, weightFieldOptionString) 471 .array; 472 473 enforce(fieldIndices.length == 1, 474 format("'--%s' must be a single field.", weightFieldOptionString)); 475 476 weightField = fieldIndices[0]; 477 } 478 479 if (!keyFieldsArg.empty) 480 { 481 keyFields = 482 keyFieldsArg 483 .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) 484 (hasHeader, headerFields, keyFieldsOptionString) 485 .array; 486 487 assert(keyFields.length > 0); 488 489 if (keyFields.length > 0) 490 { 491 if (keyFields.length == 1 && keyFields[0] == 0) 492 { 493 distinctKeyIsFullLine = true; 494 } 495 else 496 { 497 enforce(keyFields.length <= 1 || keyFields.all!(x => x != 0), 498 "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); 499 500 keyFields.each!((ref x) => --x); // Convert to zero-based indexing. 501 } 502 } 503 } 504 } 505 506 if (!hasHeader) fieldListArgProcessing(); 507 508 /* 509 * Create the inputSourceRange and perform header line processing. 510 */ 511 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 512 inputSources = inputSourceRange(filepaths, readHeader); 513 514 if (hasHeader) 515 { 516 throwIfWindowsNewline(inputSources.front.header, inputSources.front.name, 1); 517 headerFields = inputSources.front.header.split(delim).to!(string[]); 518 fieldListArgProcessing(); 519 } 520 521 } 522 catch (Exception exc) 523 { 524 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 525 return tuple(false, 1); 526 } 527 return tuple(true, 0); 528 } 529 } 530 /** Invokes the appropriate sampling routine based on the command line arguments. 531 * 532 * tsvSample is the top-level routine handling the different tsv-sample use cases. 
533 * Its primary role is to invoke the correct routine for type of sampling requested. 534 */ 535 void tsvSample(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 536 if (isOutputRange!(OutputRange, char)) 537 { 538 if (cmdopt.srsWithReplacement) 539 { 540 simpleRandomSamplingWithReplacement(cmdopt, outputStream); 541 } 542 else if (cmdopt.useBernoulliSampling) 543 { 544 bernoulliSamplingCommand(cmdopt, outputStream); 545 } 546 else if (cmdopt.useDistinctSampling) 547 { 548 if (cmdopt.genRandomInorder) distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 549 else distinctSampling!(No.generateRandomAll)(cmdopt, outputStream); 550 } 551 else if (cmdopt.genRandomInorder) 552 { 553 /* Note that the preceding cases handle gen-random-inorder themselves (Bernoulli, 554 * Distinct), or don't handle it (SRS w/ Replacement). 555 */ 556 assert(cmdopt.hasWeightField); 557 generateWeightedRandomValuesInorder(cmdopt, outputStream); 558 } 559 else if (cmdopt.sampleSize != 0) 560 { 561 randomSamplingCommand(cmdopt, outputStream); 562 } 563 else 564 { 565 shuffleCommand(cmdopt, outputStream); 566 } 567 } 568 569 /** Bernoulli sampling command handler. Invokes the appropriate Bernoulli sampling 570 * routine based on the command line arguments. 571 * 572 * This routine selects the appropriate Bernoulli sampling function and template 573 * instantiation to use based on the command line arguments. 574 * 575 * One of the basic choices is whether to use the vanilla algorithm or skip sampling. 576 * Skip sampling is a little bit faster when the inclusion probability is small but 577 * doesn't support compatibility mode. See the bernoulliSkipSampling documentation 578 * for a discussion of the skipSamplingProbabilityThreshold used here. 
579 */ 580 void bernoulliSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 581 if (isOutputRange!(OutputRange, char)) 582 { 583 assert(!cmdopt.hasWeightField); 584 585 immutable double skipSamplingProbabilityThreshold = 0.04; 586 587 if (cmdopt.compatibilityMode || 588 (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling)) 589 { 590 if (cmdopt.genRandomInorder) 591 { 592 bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 593 } 594 else 595 { 596 bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream); 597 } 598 } 599 else 600 { 601 bernoulliSkipSampling(cmdopt, outputStream); 602 } 603 } 604 605 /** Bernoulli sampling of lines from the input stream. 606 * 607 * Each input line is a assigned a random value and output if less than 608 * cmdopt.inclusionProbability. The order of the lines is not changed. 609 * 610 * This routine supports random value printing and gen-random-inorder value printing. 611 */ 612 void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 613 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 614 if (isOutputRange!(OutputRange, char)) 615 { 616 import std.random : Random = Mt19937, uniform01; 617 import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, 618 InputSourceRange, LineBuffered, throwIfWindowsNewline; 619 620 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 621 else assert(!cmdopt.genRandomInorder); 622 623 assert(!cmdopt.inputSources.empty); 624 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 625 626 auto randomGenerator = Random(cmdopt.seed); 627 628 /* First header is read during command line argument processing. 
*/ 629 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 630 { 631 auto inputStream = cmdopt.inputSources.front; 632 633 static if (generateRandomAll) 634 { 635 outputStream.put(cmdopt.randomValueHeader); 636 outputStream.put(cmdopt.delim); 637 } 638 else if (cmdopt.printRandom) 639 { 640 outputStream.put(cmdopt.randomValueHeader); 641 outputStream.put(cmdopt.delim); 642 } 643 644 outputStream.put(inputStream.header); 645 outputStream.put("\n"); 646 647 /* Immediately flush the header so subsequent processes in a unix command 648 * pipeline see it early. This helps provide timely error messages. 649 */ 650 static if (isFlushableOutputRange!OutputRange) outputStream.flush; 651 } 652 653 /* Process each line. */ 654 immutable LineBuffered isLineBuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered; 655 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 656 ulong numLinesWritten = 0; 657 658 foreach (inputStream; cmdopt.inputSources) 659 { 660 if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1); 661 662 foreach (ulong fileLineNum, line; 663 inputStream 664 .file 665 .bufferedByLine!(KeepTerminator.no)(isLineBuffered) 666 .enumerate(fileBodyStartLine)) 667 { 668 if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum); 669 670 immutable double lineScore = uniform01(randomGenerator); 671 672 static if (generateRandomAll) 673 { 674 outputStream.formatRandomValue(lineScore); 675 outputStream.put(cmdopt.delim); 676 outputStream.put(line); 677 outputStream.put("\n"); 678 679 if (cmdopt.sampleSize != 0) 680 { 681 ++numLinesWritten; 682 if (numLinesWritten == cmdopt.sampleSize) return; 683 } 684 } 685 else if (lineScore < cmdopt.inclusionProbability) 686 { 687 if (cmdopt.printRandom) 688 { 689 outputStream.formatRandomValue(lineScore); 690 outputStream.put(cmdopt.delim); 691 } 692 outputStream.put(line); 693 outputStream.put("\n"); 694 695 if (cmdopt.sampleSize != 0) 696 { 697 
++numLinesWritten; 698 if (numLinesWritten == cmdopt.sampleSize) return; 699 } 700 } 701 } 702 } 703 } 704 705 /** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips. 706 * 707 * Skip sampling works by skipping a random number of lines between selections. This 708 * can be faster than assigning a random value to each line when the inclusion 709 * probability is low, as it reduces the number of calls to the random number 710 * generator. Both the random number generator and the log() function are called when 711 * calculating the next skip size. These additional log() calls add up as the 712 * inclusion probability increases. 713 * 714 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for 715 * file-oriented line sampling. This is obviously environment specific. In the 716 * environments this implementation has been tested in the performance improvements 717 * remain small, less than 7%, even with an inclusion probability as low as 0.0001. 718 * 719 * The algorithm does not assign random values to individual lines. This makes it 720 * incompatible with random value printing. It is not suitable for compatibility mode 721 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should 722 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling 723 * does not have this property. 724 * 725 * The algorithm for calculating the skip size has been described by multiple sources. 726 * There are two key variants depending on whether the total number of lines in the 727 * data set is known in advance. (This implementation does not know the total.) 728 * Useful references: 729 * $(LIST 730 * * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling", 731 * ACM Trans on Mathematical Software, 1987. On-line: 732 * http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf 733 * * P.J. 
Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
 *       "Data Stream Management", Springer-Verlag, 2016. On-line:
 *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
 *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
 *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
 * )
 */
void bernoulliSkipSampling(OutputRange)(ref TsvSampleOptions cmdopt, OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.conv : to;
    import std.math : log, trunc;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputSourceRange, LineBuffered, throwIfWindowsNewline;

    assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.compatibilityMode);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
    immutable double logDiscardRate = log(discardRate);

    /* Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed
     * interval to (0.0, 1.0], excluding 0.0.
     */
    size_t remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;

        outputStream.put(inputStream.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line. */
    immutable LineBuffered isLineBuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong numLinesWritten = 0;
    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream
                 .file
                 .bufferedByLine!(KeepTerminator.no)(isLineBuffered)
                 .enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum);

            if (remainingSkips > 0)
            {
                --remainingSkips;
            }
            else
            {
                outputStream.put(line);
                outputStream.put("\n");

                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }

                remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
            }
        }
    }
}

/** Sample lines by choosing a random set of distinct keys formed from one or more
 * fields on each line.
 *
 * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling.
 * However, instead of each line being subject to an independent trial, lines are
 * selected based on a key from each line. A portion of keys are randomly selected for
 * output, and every line containing a selected key is included in the output.
 *
 * An example use-case is a query log having <user, query, clicked-url> triples. It is
 * often useful to sample records for portion of the users, but including all records
 * for the users selected. Distinct sampling supports this by selecting a subset of
 * users to include in the output.
829 * 830 * Distinct sampling is done by hashing the key and mapping the hash value into 831 * buckets sized to hold the inclusion probability. Records having a key mapping to 832 * bucket zero are output. Buckets are equal size and therefore may be larger than the 833 * inclusion probability. (The other approach would be to have the caller specify the 834 * the number of buckets. More correct, but less convenient.) 835 */ 836 void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 837 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 838 if (isOutputRange!(OutputRange, char)) 839 { 840 import std.algorithm : splitter; 841 import std.conv : to; 842 import std.digest.murmurhash; 843 import std.math : lrint; 844 import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, 845 InputFieldReordering, InputSourceRange, LineBuffered, throwIfWindowsNewline; 846 847 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 848 else assert(!cmdopt.genRandomInorder); 849 850 assert(cmdopt.keyFields.length > 0); 851 assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0); 852 853 assert(!cmdopt.inputSources.empty); 854 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 855 856 static if (generateRandomAll) 857 { 858 import std.format : formatValue, singleSpec; 859 immutable randomValueFormatSpec = singleSpec("%d"); 860 } 861 862 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 863 864 uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint; 865 866 /* Create a mapping for the key fields. */ 867 auto keyFieldsReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); 868 869 /* First header is read during command line argument processing. 
*/ 870 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 871 { 872 auto inputStream = cmdopt.inputSources.front; 873 874 static if (generateRandomAll) 875 { 876 outputStream.put(cmdopt.randomValueHeader); 877 outputStream.put(cmdopt.delim); 878 } 879 else if (cmdopt.printRandom) 880 { 881 outputStream.put(cmdopt.randomValueHeader); 882 outputStream.put(cmdopt.delim); 883 } 884 885 outputStream.put(inputStream.header); 886 outputStream.put("\n"); 887 888 /* Immediately flush the header so subsequent processes in a unix command 889 * pipeline see it early. This helps provide timely error messages. 890 */ 891 static if (isFlushableOutputRange!OutputRange) outputStream.flush; 892 } 893 894 /* Process each line. */ 895 immutable LineBuffered isLineBuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered; 896 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 897 ulong numLinesWritten = 0; 898 899 foreach (inputStream; cmdopt.inputSources) 900 { 901 if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1); 902 903 foreach (ulong fileLineNum, line; 904 inputStream 905 .file 906 .bufferedByLine!(KeepTerminator.no)(isLineBuffered) 907 .enumerate(fileBodyStartLine)) 908 { 909 if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum); 910 911 /* Murmurhash works by successively adding individual keys, then finalizing. 912 * Adding individual keys is simpler if the full-line-as-key and individual 913 * fields as keys cases are separated. 914 */ 915 auto hasher = MurmurHash3!32(cmdopt.seed); 916 917 if (cmdopt.distinctKeyIsFullLine) 918 { 919 hasher.put(cast(ubyte[]) line); 920 } 921 else 922 { 923 assert(keyFieldsReordering !is null); 924 925 /* Gather the key field values and assemble the key. 
*/ 926 keyFieldsReordering.initNewLine; 927 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 928 { 929 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 930 if (keyFieldsReordering.allFieldsFilled) break; 931 } 932 933 enforce(keyFieldsReordering.allFieldsFilled, 934 format("Not enough fields in line. File: %s, Line: %s", 935 inputStream.name, fileLineNum)); 936 937 foreach (count, key; keyFieldsReordering.outputFields.enumerate) 938 { 939 if (count > 0) hasher.put(delimArray); 940 hasher.put(cast(ubyte[]) key); 941 } 942 } 943 944 hasher.finish; 945 946 static if (generateRandomAll) 947 { 948 import std.conv : to; 949 outputStream.formatValue(hasher.get % numBuckets, randomValueFormatSpec); 950 outputStream.put(cmdopt.delim); 951 outputStream.put(line); 952 outputStream.put("\n"); 953 954 if (cmdopt.sampleSize != 0) 955 { 956 ++numLinesWritten; 957 if (numLinesWritten == cmdopt.sampleSize) return; 958 } 959 } 960 else if (hasher.get % numBuckets == 0) 961 { 962 if (cmdopt.printRandom) 963 { 964 outputStream.put('0'); 965 outputStream.put(cmdopt.delim); 966 } 967 outputStream.put(line); 968 outputStream.put("\n"); 969 970 if (cmdopt.sampleSize != 0) 971 { 972 ++numLinesWritten; 973 if (numLinesWritten == cmdopt.sampleSize) return; 974 } 975 } 976 } 977 } 978 } 979 980 /** Random sampling command handler. Invokes the appropriate sampling routine based on 981 * the command line arguments. 982 * 983 * Random sampling selects a fixed size random sample from the input stream. Both 984 * simple random sampling (equal likelihood) and weighted random sampling are 985 * supported. Selected lines are output either in random order or original input order. 986 * For weighted sampling the random order is the weighted selection order. 987 * 988 * Two algorithms are used, reservoir sampling via a heap and reservoir sampling via 989 * Algorithm R. 
This routine selects the appropriate reservoir sampling function and 990 * template instantiation to based on the command line arguments. 991 * 992 * Weighted sampling always uses the heap approach. Compatibility mode does as well, 993 * as it is the method that uses per-line random value assignments. The implication 994 * of compatibility mode is that a larger sample size includes all the results from 995 * a smaller sample, assuming the same random seed is used. 996 * 997 * For unweighted sampling there is a performance tradeoff between implementations. 998 * Heap-based sampling is faster for small sample sizes. Algorithm R is faster for 999 * large sample sizes. The threshold used was chosen based on performance tests. See 1000 * the reservoirSamplingAlgorithmR documentation for more information. 1001 */ 1002 1003 void randomSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1004 if (isOutputRange!(OutputRange, char)) 1005 { 1006 assert(cmdopt.sampleSize != 0); 1007 1008 immutable size_t algorithmRSampleSizeThreshold = 128 * 1024; 1009 1010 if (cmdopt.hasWeightField) 1011 { 1012 if (cmdopt.preserveInputOrder) 1013 { 1014 reservoirSamplingViaHeap!(Yes.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream); 1015 } 1016 else 1017 { 1018 reservoirSamplingViaHeap!(Yes.isWeighted, No.preserveInputOrder)(cmdopt, outputStream); 1019 } 1020 } 1021 else if (cmdopt.compatibilityMode || 1022 (cmdopt.sampleSize < algorithmRSampleSizeThreshold && !cmdopt.preferAlgorithmR)) 1023 { 1024 if (cmdopt.preserveInputOrder) 1025 { 1026 reservoirSamplingViaHeap!(No.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream); 1027 } 1028 else 1029 { 1030 reservoirSamplingViaHeap!(No.isWeighted, No.preserveInputOrder)(cmdopt, outputStream); 1031 } 1032 } 1033 else if (cmdopt.preserveInputOrder) 1034 { 1035 reservoirSamplingAlgorithmR!(Yes.preserveInputOrder)(cmdopt, outputStream); 1036 } 1037 else 1038 { 1039 
reservoirSamplingAlgorithmR!(No.preserveInputOrder)(cmdopt, outputStream); 1040 } 1041 } 1042 1043 /** Reservoir sampling using a heap. Both weighted and unweighted random sampling are 1044 * supported. 1045 * 1046 * The algorithm used here is based on the one-pass algorithm described by Pavlos 1047 * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S. 1048 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are 1049 * simply set to one. 1050 * 1051 * The implementation uses a heap (priority queue) large enough to hold the desired 1052 * number of lines. Input is read line-by-line, assigned a random value, and added to 1053 * the heap. The role of the heap is to identify the lines with the highest assigned 1054 * random values. Once the heap is full, adding a new line means dropping the line with 1055 * the lowest score. A "min" heap used for this reason. 1056 * 1057 * When done reading all lines, the "min" heap is in reverse of weighted selection 1058 * order. Weighted selection order is obtained by removing each element one at at time 1059 * from the heap. The underlying data store will have the elements in weighted selection 1060 * order (largest weights first). 1061 * 1062 * Generating output in weighted order is useful for several reasons: 1063 * - For weighted sampling, it preserves the property that smaller valid subsets can be 1064 * created by taking the first N lines. 1065 * - For unweighted sampling, it ensures that all output permutations are possible, and 1066 * are not influenced by input order or the heap data structure used. 1067 * - Order consistency is maintained when making repeated use of the same random seed, 1068 * but with different sample sizes. 1069 * 1070 * The other choice is preserving input order. This is supporting by recording line 1071 * numbers and sorting the selected sample. 1072 * 1073 * There are use cases where only the selection set matters. 
 * For these some performance
 * could be gained by skipping the reordering and simply printing the backing store
 * array in-order. Performance tests indicate only a minor benefit, so this is not
 * supported.
 *
 * Notes:
 * $(LIST
 *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
 *      randomization of all input lines. This was dropped in version 1.2.2 in favor
 *      of the approach used in randomizeLines. The latter has significant advantages
 *      given that all data must be read into memory.
 *    * For large reservoir sizes better performance can be achieved using Algorithm R.
 *      See the reservoirSamplingAlgorithmR documentation for details.
 * )
 */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, Flag!"preserveInputOrder" preserveInputOrder, OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : sort;
    import std.container.array;
    import std.container.binaryheap;
    import std.meta : AliasSeq;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputSourceRange, throwIfWindowsNewline;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    /* A reservoir entry: the line's assigned random score, the line itself, and
     * (only when input order must be preserved) its overall input line number.
     */
    static struct Entry(Flag!"preserveInputOrder" preserveInputOrder)
    {
        double score;
        const(char)[] line;
        static if (preserveInputOrder) ulong lineNumber;
    }

    /* Create the heap and backing data store.
     *
     * Note: An std.container.array is used as the backing store to avoid some issues in
     * the standard library (Phobos) binaryheap implementation. Specifically, when an
     * std.container.array is used as backing store, the heap can be efficiently reversed
     * by removing the heap elements. This leaves the backing store in the reversed order.
     * However, the current binaryheap implementation does not support this for all
     * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */

    Array!(Entry!preserveInputOrder) dataStore;
    dataStore.reserve(cmdopt.sampleSize);
    auto reservoir = dataStore.heapify!("a.score > b.score")(0); // Min binaryheap

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;

        if (cmdopt.printRandom)
        {
            outputStream.put(cmdopt.randomValueHeader);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(inputStream.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    static if (preserveInputOrder) ulong totalLineNum = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum);

            /* Assign the line's score. Unweighted: uniform in [0, 1). Weighted:
             * u ^^ (1/weight), the Efraimidis-Spirakis key. A non-positive weight
             * gets score 0.0, so it is never preferred over a positive weight line.
             */
            static if (!isWeighted)
            {
                immutable double lineScore = uniform01(randomGenerator);
            }
            else
            {
                immutable double lineWeight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum);
                immutable double lineScore =
                    (lineWeight > 0.0)
                    ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                    : 0.0;
            }

            static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum);
            else alias entryCTArgs = AliasSeq!();

            /* Fill the reservoir first. Once full, a new line only displaces the
             * current minimum-score entry (heap front) when its score is higher.
             */
            if (reservoir.length < cmdopt.sampleSize)
            {
                reservoir.insert(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs));
            }
            else if (reservoir.front.score < lineScore)
            {
                reservoir.replaceFront(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs));
            }

            static if (preserveInputOrder) ++totalLineNum;
        }
    }

    /* Done with input, all entries are in the reservoir. */

    /* The asserts here avoid issues with the current binaryheap implementation. They
     * detect use of backing stores having a length not synchronized to the reservoir.
     */
    immutable ulong numLines = reservoir.length;
    assert(numLines == dataStore.length);

    /* Update the backing store so it is in the desired output order.
     */
    static if (preserveInputOrder)
    {
        dataStore[].sort!((a, b) => a.lineNumber < b.lineNumber);
    }
    else
    {
        /* Output in weighted selection order. The heap is in reverse order of assigned
         * weights. Reversing order is done by removing all elements from the heap. This
         * leaves the backing store in the correct order.
         */
        while (!reservoir.empty) reservoir.removeFront;
    }

    assert(numLines == dataStore.length);

    foreach (entry; dataStore)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatRandomValue(entry.score);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.line);
        outputStream.put("\n");
    }
}

/** Generate weighted random values for all input lines, preserving input order.
 *
 * This complements weighted reservoir sampling, but instead of using a reservoir it
 * simply iterates over the input lines generating the values. The weighted random
 * values are generated with the same formula used by reservoirSampling.
 */
void generateWeightedRandomValuesInorder(OutputRange)
    (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange,
        InputSourceRange, throwIfWindowsNewline;

    assert(cmdopt.hasWeightField);

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    auto randomGenerator = Random(cmdopt.seed);

    /* First header is read during command line argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;

        outputStream.put(cmdopt.randomValueHeader);
        outputStream.put(cmdopt.delim);
        outputStream.put(inputStream.header);
        outputStream.put("\n");

        /* Immediately flush the header so subsequent processes in a unix command
         * pipeline see it early. This helps provide timely error messages.
         */
        static if (isFlushableOutputRange!OutputRange) outputStream.flush;
    }

    /* Process each line. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    ulong numLinesWritten = 0;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);

        foreach (ulong fileLineNum, line;
                 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine))
        {
            if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum);

            immutable double lineWeight =
                getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum);

            /* Same scoring formula as reservoirSamplingViaHeap's weighted case. */
            immutable double lineScore =
                (lineWeight > 0.0)
                ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                : 0.0;

            outputStream.formatRandomValue(lineScore);
            outputStream.put(cmdopt.delim);
            outputStream.put(line);
            outputStream.put("\n");

            /* Honor --num: stop once the requested number of lines is written. */
            if (cmdopt.sampleSize != 0)
            {
                ++numLinesWritten;
                if (numLinesWritten == cmdopt.sampleSize) return;
            }
        }
    }
}

/** Reservoir sampling via Algorithm R
 *
 * This is an implementation of reservoir sampling using what is commonly known as
 * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of
 * Computer Programming, Volume 2: Seminumerical Algorithms". More information about
 * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with
 * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
 *
 * Algorithm R is used for unweighted sampling without replacement. The heap-based
 * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
 *
 * The classic algorithm stops after identifying the selected set of items.
This 1315 * implementation goes one step further and randomizes the order of the selected 1316 * lines. This is consistent with shuffling (line order randomization), a primary 1317 * tsv-sample use-case. 1318 * 1319 * This algorithm is faster than reservoirSamplingViaHeap when the sample size 1320 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size. 1321 * Insertion in this algorithm is O(1). Similarly, generating the random order in the 1322 * heap is O(k * log k), while in this algorithm the final randomization step is O(k). 1323 * 1324 * This speed advantage may be offset a certain amount by using a more expensive random 1325 * value generator. reservoirSamplingViaHeap generates values between zero and one, 1326 * whereas reservoirSamplingAlgorithmR generates random integers over and ever growing 1327 * interval. The latter is expected to be more expensive. This is consistent with 1328 * performance tests indicating that reservoirSamplingViaHeap is faster when using 1329 * small-to-medium size reservoirs and large input streams. 
1330 */ 1331 void reservoirSamplingAlgorithmR(Flag!"preserveInputOrder" preserveInputOrder, OutputRange) 1332 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1333 if (isOutputRange!(OutputRange, char)) 1334 { 1335 import std.meta : AliasSeq; 1336 import std.random : Random = Mt19937, randomShuffle, uniform; 1337 import std.algorithm : sort; 1338 import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange, 1339 InputSourceRange, throwIfWindowsNewline; 1340 1341 assert(cmdopt.sampleSize > 0); 1342 assert(!cmdopt.hasWeightField); 1343 assert(!cmdopt.compatibilityMode); 1344 assert(!cmdopt.printRandom); 1345 assert(!cmdopt.genRandomInorder); 1346 1347 assert(!cmdopt.inputSources.empty); 1348 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 1349 1350 static struct Entry(Flag!"preserveInputOrder" preserveInputOrder) 1351 { 1352 const(char)[] line; 1353 static if (preserveInputOrder) ulong lineNumber; 1354 } 1355 1356 Entry!preserveInputOrder[] reservoir; 1357 auto reservoirAppender = appender(&reservoir); 1358 reservoirAppender.reserve(cmdopt.sampleSize); 1359 1360 auto randomGenerator = Random(cmdopt.seed); 1361 1362 /* First header is read during command line argument processing. */ 1363 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 1364 { 1365 auto inputStream = cmdopt.inputSources.front; 1366 1367 outputStream.put(inputStream.header); 1368 outputStream.put("\n"); 1369 1370 /* Immediately flush the header so subsequent processes in a unix command 1371 * pipeline see it early. This helps provide timely error messages. 1372 */ 1373 static if (isFlushableOutputRange!OutputRange) outputStream.flush; 1374 } 1375 1376 /* Process each line. */ 1377 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 
2 : 1; 1378 ulong totalLineNum = 0; 1379 1380 foreach (inputStream; cmdopt.inputSources) 1381 { 1382 if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1); 1383 1384 foreach (ulong fileLineNum, line; 1385 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 1386 { 1387 if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum); 1388 1389 /* Add lines to the reservoir until the reservoir is filled. 1390 * After that lines are added with decreasing likelihood, based on 1391 * the total number of lines seen. If added to the reservoir, the 1392 * line replaces a randomly chosen existing line. 1393 */ 1394 static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum); 1395 else alias entryCTArgs = AliasSeq!(); 1396 1397 if (totalLineNum < cmdopt.sampleSize) 1398 { 1399 reservoirAppender ~= Entry!preserveInputOrder(line.idup, entryCTArgs); 1400 } 1401 else 1402 { 1403 immutable size_t i = uniform(0, totalLineNum, randomGenerator); 1404 if (i < reservoir.length) 1405 { 1406 reservoir[i] = Entry!preserveInputOrder(line.idup, entryCTArgs); 1407 } 1408 } 1409 1410 ++totalLineNum; 1411 } 1412 } 1413 1414 /* Done with input. The sample is in the reservoir. Update the order and print. */ 1415 1416 static if (preserveInputOrder) 1417 { 1418 reservoir.sort!((a, b) => a.lineNumber < b.lineNumber); 1419 } 1420 else 1421 { 1422 reservoir.randomShuffle(randomGenerator); 1423 } 1424 1425 foreach (ref entry; reservoir) 1426 { 1427 outputStream.put(entry.line); 1428 outputStream.put("\n"); 1429 } 1430 } 1431 1432 /** Shuffling command handler. Invokes the appropriate shuffle (line order 1433 * randomization) routine based on the command line arguments. 1434 * 1435 * Shuffling has similarities to random sampling, but the algorithms used are 1436 * different. Random sampling selects a subset, only the current subset selection 1437 * needs to be kept in memory. 
This is supported by reservoir sampling. By contrast, 1438 * shuffling needs to hold all input in memory, so it works better to read all lines 1439 * into memory at once and then shuffle. 1440 * 1441 * Two different algorithms are used. Array shuffling is used for unweighted shuffling. 1442 * Sorting plus random weight assignments is used for weighted shuffling and when 1443 * compatibility mode is being used. 1444 * 1445 * The algorithms used here are all limited by available memory. 1446 */ 1447 void shuffleCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1448 if (isOutputRange!(OutputRange, char)) 1449 { 1450 if (cmdopt.hasWeightField) 1451 { 1452 randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream); 1453 } 1454 else if (cmdopt.compatibilityMode) 1455 { 1456 randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream); 1457 } 1458 else 1459 { 1460 randomizeLinesViaShuffle(cmdopt, outputStream); 1461 } 1462 } 1463 1464 /** Shuffle all input lines by assigning random weights and sorting. 1465 * 1466 * randomizeLinesViaSort reads in all input lines and writes them out in random order. 1467 * The algorithm works by assigning a random value to each line and sorting. Both 1468 * weighted and unweighted shuffling are supported. 1469 * 1470 * Notes: 1471 * $(LIST 1472 * * For unweighted shuffling randomizeLinesViaShuffle is faster and should be used 1473 * unless compatibility mode is needed. 1474 * * This routine is significantly faster than heap-based reservoir sampling in the 1475 * case where the entire file is being read. 1476 * * Input data must be read entirely in memory. Disk oriented techniques are needed 1477 * when data sizes get too large for available memory. One option is to generate 1478 * random values for each line, e.g. --gen-random-inorder, and sort with a disk- 1479 * backed sort program like GNU sort. 
1480 * ) 1481 */ 1482 void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange) 1483 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1484 if (isOutputRange!(OutputRange, char)) 1485 { 1486 import std.algorithm : map, sort; 1487 1488 static if (isWeighted) assert(cmdopt.hasWeightField); 1489 else assert(!cmdopt.hasWeightField); 1490 1491 assert(cmdopt.sampleSize == 0); 1492 1493 /* 1494 * Read all file data into memory. Then split the data into lines and assign a 1495 * random value to each line. readFileData also writes the first header line. 1496 */ 1497 const fileData = readFileData!(Yes.hasRandomValue)(cmdopt, outputStream); 1498 auto inputLines = fileData.identifyInputLines!(Yes.hasRandomValue, isWeighted)(cmdopt); 1499 1500 /* 1501 * Sort by the weight and output the lines. 1502 */ 1503 inputLines.sort!((a, b) => a.randomValue > b.randomValue); 1504 1505 foreach (lineEntry; inputLines) 1506 { 1507 if (cmdopt.printRandom) 1508 { 1509 outputStream.formatRandomValue(lineEntry.randomValue); 1510 outputStream.put(cmdopt.delim); 1511 } 1512 outputStream.put(lineEntry.data); 1513 outputStream.put("\n"); 1514 } 1515 } 1516 1517 /** Shuffle (randomize) all input lines using a shuffling algorithm. 1518 * 1519 * All lines in files and/or standard input are read in and written out in random 1520 * order. This routine uses array shuffling, which is faster than sorting. It is a 1521 * good alternative to randomizeLinesViaSort when doing unweighted shuffling (the 1522 * most common case). 1523 * 1524 * Input data size is limited by available memory. Disk oriented techniques are needed 1525 * when data sizes are larger. For example, generating random values line-by-line (ala 1526 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort. 1527 * 1528 * This routine does not support random value printing or compatibility-mode. 
1529 */ 1530 void randomizeLinesViaShuffle(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1531 if (isOutputRange!(OutputRange, char)) 1532 { 1533 import std.algorithm : map; 1534 import std.random : Random = Mt19937, randomShuffle; 1535 1536 assert(cmdopt.sampleSize == 0); 1537 assert(!cmdopt.hasWeightField); 1538 assert(!cmdopt.printRandom); 1539 assert(!cmdopt.genRandomInorder); 1540 1541 /* 1542 * Read all file data into memory and split into lines. 1543 */ 1544 const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream); 1545 auto inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt); 1546 1547 /* 1548 * Randomly shuffle and print each line. 1549 * 1550 * Note: Also tried randomCover, but that was exceedingly slow. 1551 */ 1552 import std.random : randomShuffle; 1553 1554 auto randomGenerator = Random(cmdopt.seed); 1555 inputLines.randomShuffle(randomGenerator); 1556 1557 foreach (ref line; inputLines) 1558 { 1559 outputStream.put(line.data); 1560 outputStream.put("\n"); 1561 } 1562 } 1563 1564 /** Simple random sampling with replacement. 1565 * 1566 * All lines in files and/or standard input are read in. Then random lines are selected 1567 * one at a time and output. Lines can be selected multiple times. This process continues 1568 * until the desired number of samples (--n|num) has been output. Output continues 1569 * indefinitely if a sample size was not provided. 1570 */ 1571 void simpleRandomSamplingWithReplacement(OutputRange) 1572 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1573 if (isOutputRange!(OutputRange, char)) 1574 { 1575 import std.algorithm : map; 1576 import std.random : Random = Mt19937, uniform; 1577 1578 /* 1579 * Read all file data into memory and split the data into lines. 
1580 */ 1581 const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream); 1582 const inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt); 1583 1584 if (inputLines.length > 0) 1585 { 1586 auto randomGenerator = Random(cmdopt.seed); 1587 1588 /* Repeat forever is sampleSize is zero, otherwise print sampleSize lines. */ 1589 size_t numLeft = (cmdopt.sampleSize == 0) ? 1 : cmdopt.sampleSize; 1590 while (numLeft != 0) 1591 { 1592 immutable size_t index = uniform(0, inputLines.length, randomGenerator); 1593 outputStream.put(inputLines[index].data); 1594 outputStream.put("\n"); 1595 if (cmdopt.sampleSize != 0) numLeft--; 1596 } 1597 } 1598 } 1599 1600 /** A container holding data read from a file or standard input. 1601 * 1602 * The InputBlock struct is used to represent a block of data read from a file or 1603 * standard input. An array of InputBlocks is returned by readFileData. Typically one 1604 * block per file. Multiple blocks are used for standard input and when the file size 1605 * cannot be determined. Individual lines are not allowed to span blocks. The blocks 1606 * allocated to an individual file are numbered starting with zero. 1607 * 1608 * See readFileData() for more information. 1609 */ 1610 static struct InputBlock 1611 { 1612 string filename; /// Original filename or path. "-" denotes standard input. 1613 size_t fileBlockNumber; /// Zero-based block number for the file. 1614 char[] data; /// The actual data. Newline terminated or last block for the file. 1615 } 1616 1617 /** Read data from one or more files. This routine is used by algorithms needing to 1618 * read all data into memory. 1619 * 1620 * readFileData reads in all data from a set of files. Data is returned as an array 1621 * of InputBlock structs. Normally one InputBlock per file, sized to match the size 1622 * of the file. Standard input is read in one or more blocks, as are files whose size 1623 * cannot be determined. 
Multiple blocks are used in these last two cases to avoid 1624 * expensive memory reallocations. This is not necessary when file size is known as 1625 * the necessary memory can be preallocated. 1626 * 1627 * Individual lines never span multiple blocks, and newlines are preserved. This 1628 * means that each block starts at the beginning of a line and ends with a newline 1629 * unless the end of a file has been reached. 1630 * 1631 * Each file gets its own block. Prior to using InputSourceRange this was so header 1632 * processing can be done. With InputSourceRange the header is read separately, so 1633 * this could be changed. 1634 */ 1635 InputBlock[] readFileData(HasRandomValue hasRandomValue, OutputRange) 1636 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1637 if (isOutputRange!(OutputRange, char)) 1638 { 1639 import std.algorithm : find, min; 1640 import std.range : retro; 1641 import tsv_utils.common.utils : InputSourceRange, isFlushableOutputRange, 1642 throwIfWindowsNewline; 1643 1644 static if(!hasRandomValue) assert(!cmdopt.printRandom); 1645 1646 assert(!cmdopt.inputSources.empty); 1647 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 1648 1649 /* First header is read during command line argument processing. */ 1650 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 1651 { 1652 auto inputStream = cmdopt.inputSources.front; 1653 1654 if (cmdopt.printRandom) 1655 { 1656 outputStream.put(cmdopt.randomValueHeader); 1657 outputStream.put(cmdopt.delim); 1658 } 1659 outputStream.put(inputStream.header); 1660 outputStream.put("\n"); 1661 1662 /* Immediately flush the header so subsequent processes in a unix command 1663 * pipeline see it early. This helps provide timely error messages. 1664 */ 1665 static if (isFlushableOutputRange!OutputRange) outputStream.flush; 1666 } 1667 1668 enum BlockSize = 1024L * 1024L * 1024L; // 1 GB. ('L' notation avoids overflow w/ 2GB+ sizes.) 
1669 enum ReadSize = 1024L * 128L; 1670 enum NewlineSearchSize = 1024L * 16L; 1671 1672 InputBlock[] blocks; 1673 auto blocksAppender = appender(&blocks); 1674 blocksAppender.reserve(cmdopt.inputSources.length); // At least one block per file. 1675 1676 ubyte[] rawReadBuffer = new ubyte[ReadSize]; 1677 1678 foreach (inputStream; cmdopt.inputSources) 1679 { 1680 if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1); 1681 1682 /* If the file size can be determined then read it as a single block. 1683 * Otherwise read as multiple blocks. File.size() returns ulong.max 1684 * if file size cannot be determined, so we'll combine that check 1685 * with the standard input case. 1686 */ 1687 1688 immutable ulong filesize = inputStream.isStdin ? ulong.max : inputStream.file.size; 1689 auto ifile = inputStream.file; 1690 1691 if (filesize != ulong.max) 1692 { 1693 readFileDataAsOneBlock(inputStream.name, ifile, filesize, 1694 blocksAppender, rawReadBuffer); 1695 } 1696 else 1697 { 1698 readFileDataAsMultipleBlocks( 1699 inputStream.name, ifile, blocksAppender, rawReadBuffer, 1700 BlockSize, NewlineSearchSize); 1701 } 1702 } 1703 return blocks; 1704 } 1705 1706 /* readFileData() helper function. Read data from a File handle as a single block. The 1707 * new block is appended to an existing InputBlock[] array. 1708 * 1709 * readFileDataAsOneBlocks is part of the readFileData logic. It handles the case 1710 * where a file is being read as a single block. Normally initialBlockSize is passed 1711 * as the size of the file. 1712 * 1713 * This routine has been separated out to enable unit testing. At present it is not 1714 * intended as a general API. See readFileData for more info. 
 */
private void readFileDataAsOneBlock(
    string filename,
    ref File ifile,
    const ulong initialBlockSize,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer)
{
    /* One block for the whole file; reserve capacity up front so reads append
     * without reallocation when initialBlockSize is the true file size.
     */
    blocksAppender.put(InputBlock(filename, 0));
    auto dataAppender = appender(&(blocksAppender.data[$-1].data));
    dataAppender.reserve(initialBlockSize);

    foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
    {
        dataAppender.put(cast(char[]) buffer);
    }
}

/* readFileData() helper function. Read data from a File handle as one or more blocks.
 * Blocks are appended to an existing InputBlock[] array.
 *
 * readFileDataAsMultipleBlocks is part of the readFileData logic. It handles the case
 * where a file or standard input is being read as a series of blocks. This is the
 * standard approach for standard input, but also applies when the file size cannot be
 * determined.
 *
 * This routine has been separated out to enable unit testing. At present it is not
 * intended as a general API. See readFileData for more info.
 */
private void readFileDataAsMultipleBlocks(
    string filename,
    ref File ifile,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer,
    const size_t blockSize,
    const size_t newlineSearchSize)
{
    import std.algorithm : find, min;
    import std.range : retro;

    assert(ifile.isOpen);

    /* Create a new block for the file and an Appender for writing data.
     */
    blocksAppender.put(InputBlock(filename, 0));
    auto dataAppender = appender(&(blocksAppender.data[$-1].data));
    dataAppender.reserve(blockSize);
    size_t blockNumber = 0;

    /* Read all the data and copy it to an InputBlock. Invariant: every block
     * except the file's last ends with a newline (lines never span blocks).
     */
    foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
    {
        assert(blockNumber == blocksAppender.data[$-1].fileBlockNumber);

        immutable size_t remainingCapacity = dataAppender.capacity - dataAppender.data.length;

        if (buffer.length <= remainingCapacity)
        {
            /* Common case: the read fits in the current block's reserved space. */
            dataAppender.put(cast(char[]) buffer);
        }
        else
        {
            /* Look for the last newline in the input buffer that fits in remaining
             * capacity of the block.
             */
            auto searchRegion = buffer[0 .. remainingCapacity];
            auto appendRegion = searchRegion.retro.find('\n').source;

            if (appendRegion.length > 0)
            {
                /* Copy the first part of the read buffer (through the last fitting
                 * newline) to the block. */
                dataAppender.put(cast(char[]) appendRegion);

                /* Create a new InputBlock and copy the remaining data to it. */
                blockNumber++;
                blocksAppender.put(InputBlock(filename, blockNumber));
                dataAppender = appender(&(blocksAppender.data[$-1].data));
                dataAppender.reserve(blockSize);
                dataAppender.put(cast(char[]) buffer[appendRegion.length .. $]);

                assert(blocksAppender.data.length >= 2);
                assert(blocksAppender.data[$-2].data[$-1] == '\n');
            }
            else
            {
                /* Search backward in the current block for a newline. If found, it
                 * becomes the last newline in the current block. Anything following
                 * it is moved to the new block. If a newline is not found, simply
                 * append to the current block and let it grow. We'll only search
                 * backward so far (newlineSearchSize).
                 */
                immutable size_t currBlockLength = blocksAppender.data[$-1].data.length;
                immutable size_t searchLength = min(currBlockLength, newlineSearchSize);
                immutable size_t searchStart = currBlockLength - searchLength;
                auto blockSearchRegion = blocksAppender.data[$-1].data[searchStart .. $];
                auto lastNewlineOffset = blockSearchRegion.retro.find('\n').source.length;

                if (lastNewlineOffset != 0)
                {
                    /* Create a new InputBlock. The previous InputBlock is then found
                     * at blocksAppender.data[$-2]. It may be a physically different
                     * struct (a copy) if the blocks array gets reallocated.
                     */
                    blockNumber++;
                    blocksAppender.put(InputBlock(filename, blockNumber));
                    dataAppender = appender(&(blocksAppender.data[$-1].data));
                    dataAppender.reserve(blockSize);

                    /* Copy data following the newline from the last block to the new
                     * block. Then append the current read buffer.
                     */
                    immutable size_t moveRegionStart = searchStart + lastNewlineOffset;
                    dataAppender.put(blocksAppender.data[$-2].data[moveRegionStart .. $]);
                    dataAppender.put(cast(char[]) buffer);

                    /* Now delete the moved region from the last block. */
                    blocksAppender.data[$-2].data.length = moveRegionStart;

                    assert(blocksAppender.data.length >= 2);
                    assert(blocksAppender.data[$-2].data[$-1] == '\n');
                }
                else
                {
                    /* Give up. Allow the current block to grow. */
                    dataAppender.put(cast(char[]) buffer);
                }
            }
        }
    }
}

/** HasRandomValue is a boolean flag used at compile time by identifyInputLines to
 * distinguish use cases needing random value assignments from those that don't.
 */
alias HasRandomValue = Flag!"hasRandomValue";

/** An InputLine array is returned by identifyInputLines to represent each non-header
 * line found in a FileData array. The 'data' member contains the line. A 'randomValue'
 * member is included if random values are being generated.
 */
static struct InputLine(HasRandomValue hasRandomValue)
{
    const(char)[] data;
    static if (hasRandomValue) double randomValue;
}

/** identifyInputLines is used by algorithms that read all files into memory prior to
 * processing. It does the initial processing of the file data.
 *
 * Two main tasks are performed. One is splitting all input data into lines.
The second
 * is assigning a random value to the line, if random values are being generated.
 *
 * The key input is an InputBlock array. Normally one block for each file, but standard
 * input may have multiple blocks.
 *
 * The return value is an array of InputLine structs. The struct will have a 'randomValue'
 * member if random values are being assigned.
 */
InputLine!hasRandomValue[] identifyInputLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted)
(const ref InputBlock[] inputBlocks, ref TsvSampleOptions cmdopt)
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewline;

    /* Weighted sampling requires random values; enforce at compile time. */
    static assert(hasRandomValue || !isWeighted);
    static if(!hasRandomValue) assert(!cmdopt.printRandom);

    InputLine!hasRandomValue[] inputLines;

    auto linesAppender = appender(&inputLines);
    static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed);

    /* Note: fileLineNum is zero-based here. One-based in most other code in this file. */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 1 : 0;
    size_t fileLineNum = fileBodyStartLine;

    foreach (block; inputBlocks)
    {
        /* Drop the last newline to avoid adding an extra empty line. */
        const data = (block.data.length > 0 && block.data[$-1] == '\n') ?
            block.data[0 .. $-1] : block.data;

        /* Block zero marks the start of a new file; restart the line count. */
        if (block.fileBlockNumber == 0) fileLineNum = fileBodyStartLine;

        foreach (ref line; data.splitter('\n'))
        {
            fileLineNum++;

            if (fileLineNum == 1) throwIfWindowsNewline(line, block.filename, fileLineNum);

            static if (!hasRandomValue)
            {
                linesAppender.put(InputLine!hasRandomValue(line));
            }
            else
            {
                static if (!isWeighted)
                {
                    immutable double randomValue = uniform01(randomGenerator);
                }
                else
                {
                    immutable double lineWeight =
                        getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                             block.filename, fileLineNum);
                    /* Efraimidis-Spirakis style weighted key: uniform01 ^^ (1/weight).
                     * Non-positive weights get the lowest possible key (0.0). */
                    immutable double randomValue =
                        (lineWeight > 0.0)
                        ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                        : 0.0;
                }

                linesAppender.put(InputLine!hasRandomValue(line, randomValue));
            }
        }
    }

    return inputLines;
}


/* Unit tests for ReadFileData. These tests focus on multiple InputBlock scenarios.
 * Other use paths are well tested by the tests at the end of the file.
 */
unittest
{
    import tsv_utils.common.unittest_utils;
    import std.algorithm : equal, find, joiner, splitter;
    import std.array : appender;
    import std.file : rmdirRecurse;
    import std.path : buildPath;
    import std.range : repeat;

    auto rfdTestDir = makeUnittestTempDir("tsv_sample_readFileData");
    scope(exit) rfdTestDir.rmdirRecurse;

    char[] file1Data;
    char[] file2Data;
    char[] file3Data;

    auto app1 = appender(&file1Data);
    auto app2 = appender(&file2Data);
    auto app3 = appender(&file3Data);

    /* File 1: 1000 short lines. */
    app1.put("\n".repeat(100).joiner);
    app1.put("x\n".repeat(100).joiner);
    app1.put("yz\n".repeat(100).joiner);
    app1.put("pqr\n".repeat(100).joiner);
    app1.put("a\nbc\ndef\n".repeat(100).joiner);
    app1.put('\n'.repeat(100));
    app1.put("z\n".repeat(100).joiner);
    app1.put("xy\n".repeat(100).joiner);

    /* File 2: 500 longer lines. */
    app2.put(
        "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
        .repeat(100)
        .joiner);
    app2.put(
        "|abcdefghijklmnopqrstuv|\n|0123456789|\n|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ|\n|abcdefghijklmnopqrstuvwxyz|\n"
        .repeat(100)
        .joiner);
    app2.put(
        "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
        .repeat(100)
        .joiner);

    /* File 3: 1000 mixed length lines. */
    app3.put("\n\n|abcde|\n1\n12\n123\n|abcdefghijklmnop|\n|xyz|\n0123456789\nX\n".repeat(100).joiner);

    string file1Path = buildPath(rfdTestDir, "file1.txt");
    string file2Path = buildPath(rfdTestDir, "file2.txt");
    string file3Path = buildPath(rfdTestDir, "file3.txt");

    try
    {
        auto ofile1 = File(file1Path, "wb");
        ofile1.write(file1Data);
        ofile1.close;
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file1Path, e.msg));

    try
    {
        auto ofile2 = File(file2Path, "wb");
        ofile2.write(file2Data);
        ofile2.close;
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file2Path, e.msg));

    try
    {
        auto ofile3 = File(file3Path, "wb");
        ofile3.write(file3Data);
        ofile3.close;
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file3Path, e.msg));

    /* Expected lines: all data concatenated, split on newline; drop the final
     * empty entry produced by the trailing newline. */
    auto allData = file1Data ~ file2Data ~ file3Data;
    auto expectedLines = allData.splitter('\n').array[0 .. $-1];

    auto file2DataNoHeader = (file2Data.find('\n'))[1 .. $];
    auto file3DataNoHeader = (file3Data.find('\n'))[1 .. $];
    auto allDataUsingHeader = file1Data ~ file2DataNoHeader ~ file3DataNoHeader;
    auto expectedLinesUsingHeader = allDataUsingHeader.splitter('\n').array[0 .. $-1];

    assert(expectedLines.length == expectedLinesUsingHeader.length + 2);

    /* We need real files for creating command line arg structs.
     */
    string file1Copy1Path = buildPath(rfdTestDir, "file1_copy1.txt");
    string file1Copy2Path = buildPath(rfdTestDir, "file1_copy2.txt");

    try
    {
        auto ofile = File(file1Copy1Path, "wb");
        ofile.write(file1Data);
        ofile.close;
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file1Copy1Path, e.msg));

    try
    {
        auto ofile = File(file1Copy2Path, "wb");
        ofile.write(file1Data);
        ofile.close;
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file1Copy2Path, e.msg));

    TsvSampleOptions cmdoptNoHeader;
    auto noHeaderCmdArgs = ["unittest", file1Copy1Path];
    auto r1 = cmdoptNoHeader.processArgs(noHeaderCmdArgs);
    assert(r1[0], format("Invalid command lines arg: '%s'.", noHeaderCmdArgs));

    TsvSampleOptions cmdoptYesHeader;
    auto yesHeaderCmdArgs = ["unittest", "--header", file1Copy2Path];
    auto r2 = cmdoptYesHeader.processArgs(yesHeaderCmdArgs);
    assert(r2[0], format("Invalid command lines arg: '%s'.", yesHeaderCmdArgs));

    scope (exit)
    {
        /* Close the files being used by the cmdopt[yes|no]Header structs. */
        while (!cmdoptNoHeader.inputSources.empty) cmdoptNoHeader.inputSources.popFront;
        while (!cmdoptYesHeader.inputSources.empty) cmdoptYesHeader.inputSources.popFront;
    }

    auto outputStream = appender!(char[])();

    {
        /* Reading as single blocks. */
        ubyte[] rawReadBuffer = new ubyte[256];
        InputBlock[] blocks;
        auto blocksAppender = appender(&blocks);
        blocksAppender.reserve(3);
        foreach (f; [ file1Path, file2Path, file3Path ])
        {
            auto ifile = f.File("rb");
            ulong filesize = ifile.size;
            if (filesize == ulong.max) filesize = 1000;
            readFileDataAsOneBlock(f, ifile, filesize, blocksAppender, rawReadBuffer);
            ifile.close;
        }
        auto inputLines =
            identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                blocks, cmdoptNoHeader);

        assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
    }

    {
        /* Reading as multiple blocks. Exercise many combinations of newline search
         * size, block size, and read size, including degenerate sizes. */
        foreach (size_t searchSize; [ 0, 1, 2, 64 ])
        {
            foreach (size_t blockSize; [ 1, 2, 16, 64, 256 ])
            {
                foreach (size_t readSize; [ 1, 2, 8, 32 ])
                {
                    ubyte[] rawReadBuffer = new ubyte[readSize];
                    InputBlock[] blocks;
                    auto blocksAppender = appender(&blocks);
                    blocksAppender.reserve(3);
                    foreach (f; [ file1Path, file2Path, file3Path ])
                    {
                        auto ifile = f.File("rb");
                        readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
                                                     rawReadBuffer, blockSize, searchSize);
                        ifile.close;
                    }
                    auto inputLines =
                        identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                            blocks, cmdoptNoHeader);

                    assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
                }
            }
        }
    }
    version(none) {
    {
        /* Reading as multiple blocks, with header processing. (Disabled.) */
        const size_t readSize = 32;
        const size_t blockSize = 48;
        const size_t searchSize = 16;

        ubyte[] rawReadBuffer = new ubyte[readSize];
        InputBlock[] blocks;
        auto blocksAppender = appender(&blocks);
        blocksAppender.reserve(3);
        foreach (f; [ file1Path, file2Path, file3Path ])
        {
            auto ifile = f.File("rb");
            readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
                                         rawReadBuffer, blockSize, searchSize);
            ifile.close;
        }
        auto inputLines =
            identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                blocks, cmdoptYesHeader);

        assert(outputStream.data == expectedLinesUsingHeader[0] ~ '\n');
        assert(equal!((a, b) => a.data == b)(inputLines, expectedLinesUsingHeader[1 .. $]));
    }
    }
}

/** Write a floating point random value to an output stream.
 *
 * This routine is used for floating point random value printing. This routine writes
 * 17 significant digits, the range available in doubles. This routine prefers decimal
 * format, without exponents. It will generate somewhat large precision numbers,
 * currently up to 28 digits, before switching to exponents.
 *
 * The primary reason for this approach is to enable faster sorting on random values
 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster
 * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch).
 * The 'general numeric' handles exponential notation. The difference is 5-10x.
 *
 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12.
 * No examples less than 1e-09 were seen in hundreds of millions of trials. Similar
 * results were seen with weighted sampling with integer weights. The same is not true
 * with floating point weights. These produce quite large exponents. However, even
 * for floating point weights this can be useful.
For random weights [0,1] less than 5%
 * will be less than 1e-12 and use exponential notation.
 */
void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;

    /* Precision grows by one digit per decade below 0.1 so that 17 significant
     * digits are always printed; below 1e-12 fall back to '%g' (exponent form).
     */
    immutable spec17f = singleSpec("%.17f");
    immutable spec18f = singleSpec("%.18f");
    immutable spec19f = singleSpec("%.19f");
    immutable spec20f = singleSpec("%.20f");
    immutable spec21f = singleSpec("%.21f");
    immutable spec22f = singleSpec("%.22f");
    immutable spec23f = singleSpec("%.23f");
    immutable spec24f = singleSpec("%.24f");
    immutable spec25f = singleSpec("%.25f");
    immutable spec26f = singleSpec("%.26f");
    immutable spec27f = singleSpec("%.27f");
    immutable spec28f = singleSpec("%.28f");

    immutable spec17g = singleSpec("%.17g");

    immutable formatSpec =
        (value >= 1e-01) ? spec17f :
        (value >= 1e-02) ? spec18f :
        (value >= 1e-03) ? spec19f :
        (value >= 1e-04) ? spec20f :
        (value >= 1e-05) ? spec21f :
        (value >= 1e-06) ? spec22f :
        (value >= 1e-07) ? spec23f :
        (value >= 1e-08) ? spec24f :
        (value >= 1e-09) ? spec25f :
        (value >= 1e-10) ? spec26f :
        (value >= 1e-11) ? spec27f :
        (value >= 1e-12) ? spec28f : spec17g;

    outputStream.formatValue(value, formatSpec);
}

@safe unittest
{
    /* Verifies the exact digit strings produced for each precision decade. */
    void testFormatValue(double value, string expected)
    {
        import std.array : appender;

        auto s = appender!string();
        s.formatRandomValue(value);
        assert(s.data == expected,
               format("[testFormatValue] value: %g; expected: %s; actual: %s", value, expected, s.data));
    }

    testFormatValue(1.0, "1.00000000000000000");
    testFormatValue(0.1, "0.10000000000000001");
    testFormatValue(0.01, "0.010000000000000000");
    testFormatValue(1e-03, "0.0010000000000000000");
    testFormatValue(1e-04, "0.00010000000000000000");
    testFormatValue(1e-05, "0.000010000000000000001");
    testFormatValue(1e-06, "0.0000010000000000000000");
    testFormatValue(1e-07, "0.00000010000000000000000");
    testFormatValue(1e-08, "0.000000010000000000000000");
    testFormatValue(1e-09, "0.0000000010000000000000001");
    testFormatValue(1e-10, "0.00000000010000000000000000");
    testFormatValue(1e-11, "0.000000000009999999999999999");
    testFormatValue(1e-12, "0.0000000000010000000000000000");
    testFormatValue(1e-13, "1e-13");
    testFormatValue(1e-14, "1e-14");
    testFormatValue(12345678901234567e-15, "12.34567890123456735");
    testFormatValue(12345678901234567e-16, "1.23456789012345669");
    testFormatValue(12345678901234567e-17, "0.12345678901234566");
    testFormatValue(12345678901234567e-18, "0.012345678901234567");
    testFormatValue(12345678901234567e-19, "0.0012345678901234567");
    testFormatValue(12345678901234567e-20, "0.00012345678901234567");
    testFormatValue(12345678901234567e-21, "0.000012345678901234568");
    testFormatValue(12345678901234567e-22, "0.0000012345678901234567");
    testFormatValue(12345678901234567e-23, "0.00000012345678901234566");
    testFormatValue(12345678901234567e-24, "0.000000012345678901234567");
    testFormatValue(12345678901234567e-25, "0.0000000012345678901234566");
    testFormatValue(12345678901234567e-26, "0.00000000012345678901234568");
    testFormatValue(12345678901234567e-27, "0.000000000012345678901234567");
    testFormatValue(12345678901234567e-28, "0.0000000000012345678901234567");
    testFormatValue(12345678901234567e-29, "1.2345678901234566e-13");
}

/** Convenience function for extracting a single field from a line. See
 * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error
 * text tailored for this program.
 */
import std.traits : isSomeChar;
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, ulong lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException, to;
    import tsv_utils.common.utils : getTsvFieldValue;

    T val;
    try
    {
        val = getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* Field present but not convertible to T; suggest --header on line one. */
        throw new Exception(
            format("Could not process line: %s\n File: %s Line: %s%s",
                   exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum,
                   (lineNum == 1) ? "\n Is this a header line? Use --H|header to skip." : ""));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n File: %s Line: %s",
                   exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum));
    }

    return val;
}

@safe unittest
{
    /* getFieldValue unit tests. getTsvFieldValue has its own tests.
     * These tests make basic sanity checks on the getFieldValue wrapper.
     */
    import std.exception;

    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);
    assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 1));
    assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 2));
    assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 1));
    assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 2));
}

/* Unit tests for the main program start here.
 *
 * Portability note: Many of the tests here rely on generating consistent random numbers
 * across different platforms when using the same random seed. So far this has succeeded
 * on several different platform, compiler, and library versions. However, it is certainly
 * possible this condition will not hold on other platforms.
 *
 * For tsv-sample, this portability implies generating the same results on different
 * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
 * but it is convenient for testing. If platforms are identified that do not generate
 * the same results these tests will need to be adjusted.
 */
version(unittest)
{
    /* Unit test helper functions. */

    import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /* Runs the full tsv-sample pipeline (processArgs + tsvSample) against cmdArgs
     * and asserts the output matches the expected row set exactly.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testTsvSample] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSampleOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;   // processArgs may consume cmdArgs.
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
        auto output = appender!(char[])();

        tsvSample(cmdopt, output); // This invokes the main code line.

        auto expectedOutput = expected.tsvDataToString;

        assert(output.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, output.data.to!string));
    }
}

unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;

    auto testDir = makeUnittestTempDir("tsv_sample");
    scope(exit) testDir.rmdirRecurse;

    /* Tabular data sets and expected results use the built-in static seed.
     * Tests are run by writing the data set to a file, then calling the main
     * routine to process. The function testTsvSample plays the role of the
     * main program. Rather than writing to expected output, the results are
     * matched against expected. The expected results were verified by hand
     * prior to inclusion in the test.
     *
     * The initial part of this section is simply setting up data files and
     * expected results.
     *
     * Expected results naming conventions:
     *  - Prefix: dataNxMExpected. N and M are numbers. e.g.
data3x6Expected 2359 * - Sampling Type (required): Permute (Shuffle), Sample, Replace, Bernoulli, Distinct 2360 * - Compatibility: Compat, AlgoR, Skip, Swap, Inorder 2361 * - Weight Field: Wt<num>, e.g. Wt3 2362 * - Sample Size: Num<num>, eg. Num3 2363 * - Seed Value: V<num>, eg. V77 2364 * - Key Field: K<num>, e.g. K2 2365 * - Probability: P<num>, e.g P05 (5%) 2366 * - Printing Probabilities: Probs 2367 * - Printing Probs in order: ProbsInorder 2368 * - Printing Probs with custom header: RVCustom 2369 */ 2370 2371 /* Empty file. */ 2372 string[][] dataEmpty = []; 2373 string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv"); 2374 writeUnittestTsvFile(fpath_dataEmpty, dataEmpty); 2375 2376 /* 3x0, header only. */ 2377 string[][] data3x0 = [["field_a", "field_b", "field_c"]]; 2378 string fpath_data3x0 = buildPath(testDir, "data3x0.tsv"); 2379 writeUnittestTsvFile(fpath_data3x0, data3x0); 2380 2381 /* 3x1 */ 2382 string[][] data3x1 = 2383 [["field_a", "field_b", "field_c"], 2384 ["tan", "タン", "8.5"]]; 2385 2386 string fpath_data3x1 = buildPath(testDir, "data3x1.tsv"); 2387 string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv"); 2388 writeUnittestTsvFile(fpath_data3x1, data3x1); 2389 writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1 .. $]); 2390 2391 string[][] data3x1ExpectedReplaceNum3 = 2392 [["field_a", "field_b", "field_c"], 2393 ["tan", "タン", "8.5"], 2394 ["tan", "タン", "8.5"], 2395 ["tan", "タン", "8.5"]]; 2396 2397 /* 3x2 */ 2398 string[][] data3x2 = 2399 [["field_a", "field_b", "field_c"], 2400 ["brown", "褐色", "29.2"], 2401 ["gray", "グレー", "6.2"]]; 2402 2403 string fpath_data3x2 = buildPath(testDir, "data3x2.tsv"); 2404 string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv"); 2405 writeUnittestTsvFile(fpath_data3x2, data3x2); 2406 writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1 .. 
$]); 2407 2408 string[][] data3x2PermuteCompat = 2409 [["field_a", "field_b", "field_c"], 2410 ["gray", "グレー", "6.2"], 2411 ["brown", "褐色", "29.2"]]; 2412 2413 string[][] data3x2PermuteShuffle = 2414 [["field_a", "field_b", "field_c"], 2415 ["gray", "グレー", "6.2"], 2416 ["brown", "褐色", "29.2"]]; 2417 2418 /* 3x3 */ 2419 string[][] data3x3 = 2420 [["field_a", "field_b", "field_c"], 2421 ["orange", "オレンジ", "2.5"], 2422 ["pink", "ピンク", "1.1"], 2423 ["purple", "紫の", "42"]]; 2424 2425 string fpath_data3x3 = buildPath(testDir, "data3x3.tsv"); 2426 string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv"); 2427 writeUnittestTsvFile(fpath_data3x3, data3x3); 2428 writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1 .. $]); 2429 2430 string[][] data3x3ExpectedPermuteCompat = 2431 [["field_a", "field_b", "field_c"], 2432 ["purple", "紫の", "42"], 2433 ["pink", "ピンク", "1.1"], 2434 ["orange", "オレンジ", "2.5"]]; 2435 2436 string[][] data3x3ExpectedPermuteSwap = 2437 [["field_a", "field_b", "field_c"], 2438 ["purple", "紫の", "42"], 2439 ["orange", "オレンジ", "2.5"], 2440 ["pink", "ピンク", "1.1"]]; 2441 2442 /* 3x6 */ 2443 string[][] data3x6 = 2444 [["field_a", "field_b", "field_c"], 2445 ["red", "赤", "23.8"], 2446 ["green", "緑", "0.0072"], 2447 ["white", "白", "1.65"], 2448 ["yellow", "黄", "12"], 2449 ["blue", "青", "12"], 2450 ["black", "黒", "0.983"]]; 2451 string fpath_data3x6 = buildPath(testDir, "data3x6.tsv"); 2452 string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv"); 2453 writeUnittestTsvFile(fpath_data3x6, data3x6); 2454 writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1 .. 
$]); 2455 2456 // Randomization, all lines 2457 string[][] data3x6ExpectedPermuteCompat = 2458 [["field_a", "field_b", "field_c"], 2459 ["yellow", "黄", "12"], 2460 ["black", "黒", "0.983"], 2461 ["blue", "青", "12"], 2462 ["white", "白", "1.65"], 2463 ["green", "緑", "0.0072"], 2464 ["red", "赤", "23.8"]]; 2465 2466 string[][] data3x6ExpectedPermuteSwap = 2467 [["field_a", "field_b", "field_c"], 2468 ["black", "黒", "0.983"], 2469 ["green", "緑", "0.0072"], 2470 ["red", "赤", "23.8"], 2471 ["yellow", "黄", "12"], 2472 ["white", "白", "1.65"], 2473 ["blue", "青", "12"]]; 2474 2475 string[][] data3x6ExpectedPermuteCompatProbs = 2476 [["random_value", "field_a", "field_b", "field_c"], 2477 ["0.96055546286515892", "yellow", "黄", "12"], 2478 ["0.75710153928957880", "black", "黒", "0.983"], 2479 ["0.52525980887003243", "blue", "青", "12"], 2480 ["0.49287854949943721", "white", "白", "1.65"], 2481 ["0.15929344086907804", "green", "緑", "0.0072"], 2482 ["0.010968807619065046", "red", "赤", "23.8"]]; 2483 2484 /* Note: data3x6ExpectedSampleAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because 2485 * both are effectively the same algorithm given that --num is data length. Both read 2486 * in the full data in order then call randomShuffle. 
2487 */ 2488 string[][] data3x6ExpectedSampleAlgoRNum6 = 2489 [["field_a", "field_b", "field_c"], 2490 ["black", "黒", "0.983"], 2491 ["green", "緑", "0.0072"], 2492 ["red", "赤", "23.8"], 2493 ["yellow", "黄", "12"], 2494 ["white", "白", "1.65"], 2495 ["blue", "青", "12"]]; 2496 2497 string[][] data3x6ExpectedSampleAlgoRNum5 = 2498 [["field_a", "field_b", "field_c"], 2499 ["red", "赤", "23.8"], 2500 ["black", "黒", "0.983"], 2501 ["white", "白", "1.65"], 2502 ["green", "緑", "0.0072"], 2503 ["yellow", "黄", "12"]]; 2504 2505 string[][] data3x6ExpectedSampleAlgoRNum4 = 2506 [["field_a", "field_b", "field_c"], 2507 ["blue", "青", "12"], 2508 ["green", "緑", "0.0072"], 2509 ["black", "黒", "0.983"], 2510 ["white", "白", "1.65"]]; 2511 2512 string[][] data3x6ExpectedSampleAlgoRNum3 = 2513 [["field_a", "field_b", "field_c"], 2514 ["red", "赤", "23.8"], 2515 ["black", "黒", "0.983"], 2516 ["green", "緑", "0.0072"]]; 2517 2518 string[][] data3x6ExpectedSampleAlgoRNum2 = 2519 [["field_a", "field_b", "field_c"], 2520 ["black", "黒", "0.983"], 2521 ["red", "赤", "23.8"]]; 2522 2523 string[][] data3x6ExpectedSampleAlgoRNum1 = 2524 [["field_a", "field_b", "field_c"], 2525 ["green", "緑", "0.0072"]]; 2526 2527 /* Inorder versions. 
*/ 2528 string[][] data3x6ExpectedSampleAlgoRNum6Inorder = 2529 [["field_a", "field_b", "field_c"], 2530 ["red", "赤", "23.8"], 2531 ["green", "緑", "0.0072"], 2532 ["white", "白", "1.65"], 2533 ["yellow", "黄", "12"], 2534 ["blue", "青", "12"], 2535 ["black", "黒", "0.983"]]; 2536 2537 string[][] data3x6ExpectedSampleAlgoRNum5Inorder = 2538 [["field_a", "field_b", "field_c"], 2539 ["red", "赤", "23.8"], 2540 ["green", "緑", "0.0072"], 2541 ["white", "白", "1.65"], 2542 ["yellow", "黄", "12"], 2543 ["black", "黒", "0.983"]]; 2544 2545 string[][] data3x6ExpectedSampleAlgoRNum4Inorder = 2546 [["field_a", "field_b", "field_c"], 2547 ["green", "緑", "0.0072"], 2548 ["white", "白", "1.65"], 2549 ["blue", "青", "12"], 2550 ["black", "黒", "0.983"]]; 2551 2552 string[][] data3x6ExpectedSampleAlgoRNum3Inorder = 2553 [["field_a", "field_b", "field_c"], 2554 ["red", "赤", "23.8"], 2555 ["green", "緑", "0.0072"], 2556 ["black", "黒", "0.983"]]; 2557 2558 string[][] data3x6ExpectedSampleAlgoRNum2Inorder = 2559 [["field_a", "field_b", "field_c"], 2560 ["red", "赤", "23.8"], 2561 ["black", "黒", "0.983"]]; 2562 2563 string[][] data3x6ExpectedSampleAlgoRNum1Inorder = 2564 [["field_a", "field_b", "field_c"], 2565 ["green", "緑", "0.0072"]]; 2566 2567 /* Reservoir inorder */ 2568 string[][] data3x6ExpectedSampleCompatNum6Inorder = 2569 [["field_a", "field_b", "field_c"], 2570 ["red", "赤", "23.8"], 2571 ["green", "緑", "0.0072"], 2572 ["white", "白", "1.65"], 2573 ["yellow", "黄", "12"], 2574 ["blue", "青", "12"], 2575 ["black", "黒", "0.983"]]; 2576 2577 string[][] data3x6ExpectedSampleCompatNum5Inorder = 2578 [["field_a", "field_b", "field_c"], 2579 ["green", "緑", "0.0072"], 2580 ["white", "白", "1.65"], 2581 ["yellow", "黄", "12"], 2582 ["blue", "青", "12"], 2583 ["black", "黒", "0.983"]]; 2584 2585 string[][] data3x6ExpectedSampleCompatNum4Inorder = 2586 [["field_a", "field_b", "field_c"], 2587 ["white", "白", "1.65"], 2588 ["yellow", "黄", "12"], 2589 ["blue", "青", "12"], 2590 ["black", "黒", "0.983"]]; 2591 
2592 string[][] data3x6ExpectedSampleCompatNum3Inorder = 2593 [["field_a", "field_b", "field_c"], 2594 ["yellow", "黄", "12"], 2595 ["blue", "青", "12"], 2596 ["black", "黒", "0.983"]]; 2597 2598 string[][] data3x6ExpectedSampleCompatNum2Inorder = 2599 [["field_a", "field_b", "field_c"], 2600 ["yellow", "黄", "12"], 2601 ["black", "黒", "0.983"]]; 2602 2603 string[][] data3x6ExpectedSampleCompatNum1Inorder = 2604 [["field_a", "field_b", "field_c"], 2605 ["yellow", "黄", "12"]]; 2606 2607 2608 /* Reservoir inorder with probabilities. */ 2609 string[][] data3x6ExpectedSampleCompatNum6ProbsInorder = 2610 [["random_value", "field_a", "field_b", "field_c"], 2611 ["0.010968807619065046", "red", "赤", "23.8"], 2612 ["0.15929344086907804", "green", "緑", "0.0072"], 2613 ["0.49287854949943721", "white", "白", "1.65"], 2614 ["0.96055546286515892", "yellow", "黄", "12"], 2615 ["0.52525980887003243", "blue", "青", "12"], 2616 ["0.75710153928957880", "black", "黒", "0.983"]]; 2617 2618 string[][] data3x6ExpectedSampleCompatNum5ProbsInorder = 2619 [["random_value", "field_a", "field_b", "field_c"], 2620 ["0.15929344086907804", "green", "緑", "0.0072"], 2621 ["0.49287854949943721", "white", "白", "1.65"], 2622 ["0.96055546286515892", "yellow", "黄", "12"], 2623 ["0.52525980887003243", "blue", "青", "12"], 2624 ["0.75710153928957880", "black", "黒", "0.983"]]; 2625 2626 string[][] data3x6ExpectedSampleCompatNum4ProbsInorder = 2627 [["random_value", "field_a", "field_b", "field_c"], 2628 ["0.49287854949943721", "white", "白", "1.65"], 2629 ["0.96055546286515892", "yellow", "黄", "12"], 2630 ["0.52525980887003243", "blue", "青", "12"], 2631 ["0.75710153928957880", "black", "黒", "0.983"]]; 2632 2633 string[][] data3x6ExpectedSampleCompatNum3ProbsInorder = 2634 [["random_value", "field_a", "field_b", "field_c"], 2635 ["0.96055546286515892", "yellow", "黄", "12"], 2636 ["0.52525980887003243", "blue", "青", "12"], 2637 ["0.75710153928957880", "black", "黒", "0.983"]]; 2638 2639 string[][] 
data3x6ExpectedSampleCompatNum2ProbsInorder = 2640 [["random_value", "field_a", "field_b", "field_c"], 2641 ["0.96055546286515892", "yellow", "黄", "12"], 2642 ["0.75710153928957880", "black", "黒", "0.983"]]; 2643 2644 string[][] data3x6ExpectedSampleCompatNum1ProbsInorder = 2645 [["random_value", "field_a", "field_b", "field_c"], 2646 ["0.96055546286515892", "yellow", "黄", "12"]]; 2647 2648 string[][] data3x6ExpectedWt3Num6Inorder = 2649 [["field_a", "field_b", "field_c"], 2650 ["red", "赤", "23.8"], 2651 ["green", "緑", "0.0072"], 2652 ["white", "白", "1.65"], 2653 ["yellow", "黄", "12"], 2654 ["blue", "青", "12"], 2655 ["black", "黒", "0.983"]]; 2656 2657 string[][] data3x6ExpectedWt3Num5Inorder = 2658 [["field_a", "field_b", "field_c"], 2659 ["green", "緑", "0.0072"], 2660 ["white", "白", "1.65"], 2661 ["yellow", "黄", "12"], 2662 ["blue", "青", "12"], 2663 ["black", "黒", "0.983"]]; 2664 2665 string[][] data3x6ExpectedWt3Num4Inorder = 2666 [["field_a", "field_b", "field_c"], 2667 ["white", "白", "1.65"], 2668 ["yellow", "黄", "12"], 2669 ["blue", "青", "12"], 2670 ["black", "黒", "0.983"]]; 2671 2672 string[][] data3x6ExpectedWt3Num3Inorder = 2673 [["field_a", "field_b", "field_c"], 2674 ["yellow", "黄", "12"], 2675 ["blue", "青", "12"], 2676 ["black", "黒", "0.983"]]; 2677 2678 string[][] data3x6ExpectedWt3Num2Inorder = 2679 [["field_a", "field_b", "field_c"], 2680 ["yellow", "黄", "12"], 2681 ["black", "黒", "0.983"]]; 2682 2683 string[][] data3x6ExpectedWt3Num1Inorder = 2684 [["field_a", "field_b", "field_c"], 2685 ["yellow", "黄", "12"]]; 2686 2687 2688 string[][] data3x6ExpectedBernoulliProbsP100 = 2689 [["random_value", "field_a", "field_b", "field_c"], 2690 ["0.010968807619065046", "red", "赤", "23.8"], 2691 ["0.15929344086907804", "green", "緑", "0.0072"], 2692 ["0.49287854949943721", "white", "白", "1.65"], 2693 ["0.96055546286515892", "yellow", "黄", "12"], 2694 ["0.52525980887003243", "blue", "青", "12"], 2695 ["0.75710153928957880", "black", "黒", "0.983"]]; 2696 2697 
string[][] data3x6ExpectedBernoulliCompatProbsP60 = 2698 [["random_value", "field_a", "field_b", "field_c"], 2699 ["0.010968807619065046", "red", "赤", "23.8"], 2700 ["0.15929344086907804", "green", "緑", "0.0072"], 2701 ["0.49287854949943721", "white", "白", "1.65"], 2702 ["0.52525980887003243", "blue", "青", "12"]]; 2703 2704 string[][] data3x6ExpectedBernoulliSkipP40 = 2705 [["field_a", "field_b", "field_c"], 2706 ["red", "赤", "23.8"], 2707 ["green", "緑", "0.0072"], 2708 ["yellow", "黄", "12"]]; 2709 2710 string[][] data3x6ExpectedBernoulliCompatP60 = 2711 [["field_a", "field_b", "field_c"], 2712 ["red", "赤", "23.8"], 2713 ["green", "緑", "0.0072"], 2714 ["white", "白", "1.65"], 2715 ["blue", "青", "12"]]; 2716 2717 string[][] data3x6ExpectedDistinctK1K3P60 = 2718 [["field_a", "field_b", "field_c"], 2719 ["green", "緑", "0.0072"], 2720 ["white", "白", "1.65"], 2721 ["blue", "青", "12"]]; 2722 2723 string[][] data3x6ExpectedDistinctK1K3P60Probs = 2724 [["random_value", "field_a", "field_b", "field_c"], 2725 ["0", "green", "緑", "0.0072"], 2726 ["0", "white", "白", "1.65"], 2727 ["0", "blue", "青", "12"]]; 2728 2729 string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom = 2730 [["custom_random_value_header", "field_a", "field_b", "field_c"], 2731 ["0", "green", "緑", "0.0072"], 2732 ["0", "white", "白", "1.65"], 2733 ["0", "blue", "青", "12"]]; 2734 2735 string[][] data3x6ExpectedDistinctK2P2ProbsInorder = 2736 [["random_value", "field_a", "field_b", "field_c"], 2737 ["1", "red", "赤", "23.8"], 2738 ["0", "green", "緑", "0.0072"], 2739 ["0", "white", "白", "1.65"], 2740 ["1", "yellow", "黄", "12"], 2741 ["3", "blue", "青", "12"], 2742 ["2", "black", "黒", "0.983"]]; 2743 2744 string[][] data3x6ExpectedPermuteWt3Probs = 2745 [["random_value", "field_a", "field_b", "field_c"], 2746 ["0.99665198757645390", "yellow", "黄", "12"], 2747 ["0.94775884809836686", "blue", "青", "12"], 2748 ["0.82728234682286661", "red", "赤", "23.8"], 2749 ["0.75346697377181959", "black", "黒", "0.983"], 2750 
["0.65130103496422487", "white", "白", "1.65"], 2751 ["1.5636943712879866e-111", "green", "緑", "0.0072"]]; 2752 2753 string[][] data3x6ExpectedWt3ProbsInorder = 2754 [["random_value", "field_a", "field_b", "field_c"], 2755 ["0.82728234682286661", "red", "赤", "23.8"], 2756 ["1.5636943712879866e-111", "green", "緑", "0.0072"], 2757 ["0.65130103496422487", "white", "白", "1.65"], 2758 ["0.99665198757645390", "yellow", "黄", "12"], 2759 ["0.94775884809836686", "blue", "青", "12"], 2760 ["0.75346697377181959", "black", "黒", "0.983"]]; 2761 2762 string[][] data3x6ExpectedPermuteWt3 = 2763 [["field_a", "field_b", "field_c"], 2764 ["yellow", "黄", "12"], 2765 ["blue", "青", "12"], 2766 ["red", "赤", "23.8"], 2767 ["black", "黒", "0.983"], 2768 ["white", "白", "1.65"], 2769 ["green", "緑", "0.0072"]]; 2770 2771 2772 string[][] data3x6ExpectedReplaceNum10 = 2773 [["field_a", "field_b", "field_c"], 2774 ["black", "黒", "0.983"], 2775 ["green", "緑", "0.0072"], 2776 ["green", "緑", "0.0072"], 2777 ["red", "赤", "23.8"], 2778 ["yellow", "黄", "12"], 2779 ["red", "赤", "23.8"], 2780 ["white", "白", "1.65"], 2781 ["yellow", "黄", "12"], 2782 ["yellow", "黄", "12"], 2783 ["white", "白", "1.65"], 2784 ]; 2785 2786 string[][] data3x6ExpectedReplaceNum10V77 = 2787 [["field_a", "field_b", "field_c"], 2788 ["black", "黒", "0.983"], 2789 ["red", "赤", "23.8"], 2790 ["black", "黒", "0.983"], 2791 ["yellow", "黄", "12"], 2792 ["green", "緑", "0.0072"], 2793 ["green", "緑", "0.0072"], 2794 ["green", "緑", "0.0072"], 2795 ["yellow", "黄", "12"], 2796 ["blue", "青", "12"], 2797 ["white", "白", "1.65"], 2798 ]; 2799 2800 /* Using a different static seed. 
*/ 2801 string[][] data3x6ExpectedPermuteCompatV41Probs = 2802 [["random_value", "field_a", "field_b", "field_c"], 2803 ["0.68057272653095424", "green", "緑", "0.0072"], 2804 ["0.67681624367833138", "blue", "青", "12"], 2805 ["0.32097338931635022", "yellow", "黄", "12"], 2806 ["0.25092361867427826", "red", "赤", "23.8"], 2807 ["0.15535934292711318", "black", "黒", "0.983"], 2808 ["0.046095821075141430", "white", "白", "1.65"]]; 2809 2810 string[][] data3x6ExpectedBernoulliCompatP60V41Probs = 2811 [["random_value", "field_a", "field_b", "field_c"], 2812 ["0.25092361867427826", "red", "赤", "23.8"], 2813 ["0.046095821075141430", "white", "白", "1.65"], 2814 ["0.32097338931635022", "yellow", "黄", "12"], 2815 ["0.15535934292711318", "black", "黒", "0.983"]]; 2816 2817 string[][] data3x6ExpectedPermuteWt3V41Probs = 2818 [["random_value", "field_a", "field_b", "field_c"], 2819 ["0.96799377498910666", "blue", "青", "12"], 2820 ["0.94356245792573568", "red", "赤", "23.8"], 2821 ["0.90964601024271996", "yellow", "黄", "12"], 2822 ["0.15491658409260103", "white", "白", "1.65"], 2823 ["0.15043620392537033", "black", "黒", "0.983"], 2824 ["6.1394674830701461e-24", "green", "緑", "0.0072"]]; 2825 2826 string[][] data3x6ExpectedWt3V41ProbsInorder = 2827 [["random_value", "field_a", "field_b", "field_c"], 2828 ["0.94356245792573568", "red", "赤", "23.8"], 2829 ["6.1394674830701461e-24", "green", "緑", "0.0072"], 2830 ["0.15491658409260103", "white", "白", "1.65"], 2831 ["0.90964601024271996", "yellow", "黄", "12"], 2832 ["0.96799377498910666", "blue", "青", "12"], 2833 ["0.15043620392537033", "black", "黒", "0.983"]]; 2834 2835 2836 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 2837 string[][] combo1ExpectedPermuteCompat = 2838 [["field_a", "field_b", "field_c"], 2839 ["yellow", "黄", "12"], 2840 ["tan", "タン", "8.5"], 2841 ["brown", "褐色", "29.2"], 2842 ["green", "緑", "0.0072"], 2843 ["red", "赤", "23.8"], 2844 ["purple", "紫の", "42"], 2845 ["black", "黒", "0.983"], 2846 ["white", "白", "1.65"], 2847 ["gray", "グレー", "6.2"], 2848 ["blue", "青", "12"], 2849 ["pink", "ピンク", "1.1"], 2850 ["orange", "オレンジ", "2.5"]]; 2851 2852 string[][] combo1ExpectedPermuteCompatProbs = 2853 [["random_value", "field_a", "field_b", "field_c"], 2854 ["0.97088520275428891", "yellow", "黄", "12"], 2855 ["0.96055546286515892", "tan", "タン", "8.5"], 2856 ["0.81756894313730299", "brown", "褐色", "29.2"], 2857 ["0.75710153928957880", "green", "緑", "0.0072"], 2858 ["0.52525980887003243", "red", "赤", "23.8"], 2859 ["0.49287854949943721", "purple", "紫の", "42"], 2860 ["0.47081507067196071", "black", "黒", "0.983"], 2861 ["0.38388182921335101", "white", "白", "1.65"], 2862 ["0.29215990612283349", "gray", "グレー", "6.2"], 2863 ["0.24033216014504433", "blue", "青", "12"], 2864 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2865 ["0.010968807619065046", "orange", "オレンジ", "2.5"]]; 2866 2867 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 2868 string[][] combo1ExpectedProbsInorder = 2869 [["random_value", "field_a", "field_b", "field_c"], 2870 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 2871 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2872 ["0.49287854949943721", "purple", "紫の", "42"], 2873 ["0.96055546286515892", "tan", "タン", "8.5"], 2874 ["0.52525980887003243", "red", "赤", "23.8"], 2875 ["0.75710153928957880", "green", "緑", "0.0072"], 2876 ["0.38388182921335101", "white", "白", "1.65"], 2877 ["0.97088520275428891", "yellow", "黄", "12"], 2878 ["0.24033216014504433", "blue", "青", "12"], 2879 ["0.47081507067196071", "black", "黒", "0.983"], 2880 ["0.81756894313730299", "brown", "褐色", "29.2"], 2881 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 2882 2883 string[][] combo1ExpectedBernoulliCompatP50Probs = 2884 [["random_value", "field_a", "field_b", "field_c"], 2885 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 2886 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2887 ["0.49287854949943721", "purple", "紫の", "42"], 2888 ["0.38388182921335101", "white", "白", "1.65"], 2889 ["0.24033216014504433", "blue", "青", "12"], 2890 ["0.47081507067196071", "black", "黒", "0.983"], 2891 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 2892 2893 string[][] combo1ExpectedBernoulliCompatP40 = 2894 [["field_a", "field_b", "field_c"], 2895 ["orange", "オレンジ", "2.5"], 2896 ["pink", "ピンク", "1.1"], 2897 ["white", "白", "1.65"], 2898 ["blue", "青", "12"], 2899 ["gray", "グレー", "6.2"]]; 2900 2901 string[][] combo1ExpectedDistinctK1P40 = 2902 [["field_a", "field_b", "field_c"], 2903 ["orange", "オレンジ", "2.5"], 2904 ["red", "赤", "23.8"], 2905 ["green", "緑", "0.0072"], 2906 ["blue", "青", "12"], 2907 ["black", "黒", "0.983"]]; 2908 2909 string[][] combo1ExpectedPermuteWt3Probs = 2910 [["random_value", "field_a", "field_b", "field_c"], 2911 ["0.99754077523718754", "yellow", "黄", "12"], 2912 ["0.99527665440088786", "tan", "タン", "8.5"], 2913 ["0.99312578945741659", "brown", "褐色", "29.2"], 2914 ["0.98329602553389361", "purple", 
"紫の", "42"], 2915 ["0.97330961938083660", "red", "赤", "23.8"], 2916 ["0.88797551521739648", "blue", "青", "12"], 2917 ["0.81999230489041786", "gray", "グレー", "6.2"], 2918 ["0.55975569204250941", "white", "白", "1.65"], 2919 ["0.46472135609205739", "black", "黒", "0.983"], 2920 ["0.18824582704191337", "pink", "ピンク", "1.1"], 2921 ["0.16446131853299920", "orange", "オレンジ", "2.5"], 2922 ["1.6438086931020549e-17", "green", "緑", "0.0072"]]; 2923 2924 string[][] combo1ExpectedPermuteWt3 = 2925 [["field_a", "field_b", "field_c"], 2926 ["yellow", "黄", "12"], 2927 ["tan", "タン", "8.5"], 2928 ["brown", "褐色", "29.2"], 2929 ["purple", "紫の", "42"], 2930 ["red", "赤", "23.8"], 2931 ["blue", "青", "12"], 2932 ["gray", "グレー", "6.2"], 2933 ["white", "白", "1.65"], 2934 ["black", "黒", "0.983"], 2935 ["pink", "ピンク", "1.1"], 2936 ["orange", "オレンジ", "2.5"], 2937 ["green", "緑", "0.0072"]]; 2938 2939 string[][] combo1ExpectedSampleAlgoRNum4 = 2940 [["field_a", "field_b", "field_c"], 2941 ["blue", "青", "12"], 2942 ["gray", "グレー", "6.2"], 2943 ["brown", "褐色", "29.2"], 2944 ["white", "白", "1.65"]]; 2945 2946 string[][] combo1ExpectedSampleAlgoRNum4Inorder = 2947 [["field_a", "field_b", "field_c"], 2948 ["white", "白", "1.65"], 2949 ["blue", "青", "12"], 2950 ["brown", "褐色", "29.2"], 2951 ["gray", "グレー", "6.2"]]; 2952 2953 string[][] combo1ExpectedReplaceNum10 = 2954 [["field_a", "field_b", "field_c"], 2955 ["gray", "グレー", "6.2"], 2956 ["yellow", "黄", "12"], 2957 ["yellow", "黄", "12"], 2958 ["white", "白", "1.65"], 2959 ["tan", "タン", "8.5"], 2960 ["white", "白", "1.65"], 2961 ["blue", "青", "12"], 2962 ["black", "黒", "0.983"], 2963 ["tan", "タン", "8.5"], 2964 ["purple", "紫の", "42"]]; 2965 2966 /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. 
*/ 2967 string[][] data1x200 = 2968 [["field_a"], 2969 ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"], 2970 ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"], 2971 ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"], 2972 ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"], 2973 ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"], 2974 ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"], 2975 ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"], 2976 ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"], 2977 ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"], 2978 ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"], 2979 ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"], 2980 ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"], 2981 ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"], 2982 ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"], 2983 ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"], 2984 ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"], 2985 ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"], 2986 ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"], 2987 ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"], 2988 ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"], 2989 ]; 2990 2991 string fpath_data1x200 = 
buildPath(testDir, "data1x200.tsv"); 2992 string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv"); 2993 writeUnittestTsvFile(fpath_data1x200, data1x200); 2994 writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1 .. $]); 2995 2996 string[][] data1x200ExpectedBernoulliSkipV333P01 = 2997 [["field_a"], 2998 ["077"], 2999 ["119"]]; 3000 3001 string[][] data1x200ExpectedBernoulliSkipV333P02 = 3002 [["field_a"], 3003 ["038"], 3004 ["059"], 3005 ["124"], 3006 ["161"], 3007 ["162"], 3008 ["183"]]; 3009 3010 string[][] data1x200ExpectedBernoulliSkipV333P03 = 3011 [["field_a"], 3012 ["025"], 3013 ["039"], 3014 ["082"], 3015 ["107"], 3016 ["108"], 3017 ["122"], 3018 ["136"], 3019 ["166"], 3020 ["182"]]; 3021 3022 string[][] data1x200ExpectedBernoulliCompatV333P01 = 3023 [["field_a"], 3024 ["072"]]; 3025 3026 string[][] data1x200ExpectedBernoulliCompatV333P02 = 3027 [["field_a"], 3028 ["004"], 3029 ["072"]]; 3030 3031 string[][] data1x200ExpectedBernoulliCompatV333P03 = 3032 [["field_a"], 3033 ["004"], 3034 ["072"], 3035 ["181"]]; 3036 3037 /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files, 3038 * only expected results. The header is from 3x0, the results are offset 1-position 3039 * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line. 3040 */ 3041 string[][] combo2ExpectedBernoulliSkipV333P03 = 3042 [["field_a", "field_b", "field_c"], 3043 ["024"], 3044 ["038"], 3045 ["081"], 3046 ["106"], 3047 ["107"], 3048 ["121"], 3049 ["135"], 3050 ["165"], 3051 ["181"]]; 3052 3053 3054 /* 1x10 - Simple 1-column file. 
*/ 3055 string[][] data1x10 = 3056 [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]]; 3057 string fpath_data1x10 = buildPath(testDir, "data1x10.tsv"); 3058 string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv"); 3059 writeUnittestTsvFile(fpath_data1x10, data1x10); 3060 writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1 .. $]); 3061 3062 string[][] data1x10ExpectedPermuteCompat = 3063 [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]]; 3064 3065 string[][] data1x10ExpectedPermuteWt1 = 3066 [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]]; 3067 3068 /* 2x10a - Uniform distribution [0,1]. */ 3069 string[][] data2x10a = 3070 [["line", "weight"], 3071 ["1", "0.26788837"], 3072 ["2", "0.06601298"], 3073 ["3", "0.38627527"], 3074 ["4", "0.47379424"], 3075 ["5", "0.02966641"], 3076 ["6", "0.05636231"], 3077 ["7", "0.70529242"], 3078 ["8", "0.91836862"], 3079 ["9", "0.99103720"], 3080 ["10", "0.31401740"]]; 3081 3082 string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv"); 3083 writeUnittestTsvFile(fpath_data2x10a, data2x10a); 3084 3085 string[][] data2x10aExpectedPermuteWt2Probs = 3086 [["random_value", "line", "weight"], 3087 ["0.96833865494543658", "8", "0.91836862"], 3088 ["0.91856842054413923", "4", "0.47379424"], 3089 ["0.25730832087795091", "7", "0.70529242"], 3090 ["0.23725317907018120", "9", "0.99103720"], 3091 ["0.16016096701872204", "3", "0.38627527"], 3092 ["0.090819662667243381", "10", "0.31401740"], 3093 ["0.0071764539244361172", "6", "0.05636231"], 3094 ["0.000000048318642951630057", "1", "0.26788837"], 3095 ["0.00000000037525692966535517", "5", "0.02966641"], 3096 ["8.2123247880095796e-13", "2", "0.06601298"]]; 3097 3098 /* 2x10b - Uniform distribution [0,1000]. 
*/ 3099 string[][] data2x10b = 3100 [["line", "weight"], 3101 ["1", "761"], 3102 ["2", "432"], 3103 ["3", "103"], 3104 ["4", "448"], 3105 ["5", "750"], 3106 ["6", "711"], 3107 ["7", "867"], 3108 ["8", "841"], 3109 ["9", "963"], 3110 ["10", "784"]]; 3111 3112 string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv"); 3113 writeUnittestTsvFile(fpath_data2x10b, data2x10b); 3114 3115 string[][] data2x10bExpectedPermuteWt2Probs = 3116 [["random_value", "line", "weight"], 3117 ["0.99996486739067969", "8", "841"], 3118 ["0.99991017467137211", "4", "448"], 3119 ["0.99960871524873662", "6", "711"], 3120 ["0.99914188537143800", "5", "750"], 3121 ["0.99903963250274785", "10", "784"], 3122 ["0.99889631825931946", "7", "867"], 3123 ["0.99852058315191139", "9", "963"], 3124 ["0.99575669679158918", "2", "432"], 3125 ["0.99408758732050595", "1", "761"], 3126 ["0.99315467761212362", "3", "103"]]; 3127 3128 /* 2x10c - Logarithmic distribution in random order. */ 3129 string[][] data2x10c = 3130 [["line", "weight"], 3131 ["1", "31.85"], 3132 ["2", "17403.31"], 3133 ["3", "653.84"], 3134 ["4", "8.23"], 3135 ["5", "2671.04"], 3136 ["6", "26226.08"], 3137 ["7", "1.79"], 3138 ["8", "354.56"], 3139 ["9", "35213.81"], 3140 ["10", "679.29"]]; 3141 3142 string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv"); 3143 writeUnittestTsvFile(fpath_data2x10c, data2x10c); 3144 3145 string[][] data2x10cExpectedPermuteWt2Probs = 3146 [["random_value", "line", "weight"], 3147 ["0.99998939008709697", "6", "26226.08"], 3148 ["0.99995951291695517", "9", "35213.81"], 3149 ["0.99991666907613541", "8", "354.56"], 3150 ["0.99989445052186410", "2", "17403.31"], 3151 ["0.99975897602861630", "5", "2671.04"], 3152 ["0.99891852769877643", "3", "653.84"], 3153 ["0.99889167752782515", "10", "679.29"], 3154 ["0.99512207506850148", "4", "8.23"], 3155 ["0.86789371584259023", "1", "31.85"], 3156 ["0.58574438162915610", "7", "1.79"]]; 3157 3158 /* 2x10d. Logarithmic distribution in ascending order. 
*/ 3159 string[][] data2x10d = 3160 [["line", "weight"], 3161 ["1", "1.79"], 3162 ["2", "8.23"], 3163 ["3", "31.85"], 3164 ["4", "354.56"], 3165 ["5", "653.84"], 3166 ["6", "679.29"], 3167 ["7", "2671.04"], 3168 ["8", "17403.31"], 3169 ["9", "26226.08"], 3170 ["10", "35213.81"]]; 3171 3172 string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv"); 3173 writeUnittestTsvFile(fpath_data2x10d, data2x10d); 3174 3175 string[][] data2x10dExpectedPermuteWt2Probs = 3176 [["random_value", "line", "weight"], 3177 ["0.99999830221846353", "8", "17403.31"], 3178 ["0.99997860834041397", "10", "35213.81"], 3179 ["0.99994563828986716", "9", "26226.08"], 3180 ["0.99988650363575737", "4", "354.56"], 3181 ["0.99964161939190088", "7", "2671.04"], 3182 ["0.99959045338948649", "6", "679.29"], 3183 ["0.99901574490639788", "5", "653.84"], 3184 ["0.97803163304747431", "3", "31.85"], 3185 ["0.79994791806910948", "2", "8.23"], 3186 ["0.080374261239949119", "1", "1.79"]]; 3187 3188 /* 2x10e. Logarithmic distribution in descending order. 
*/ 3189 string[][] data2x10e = 3190 [["line", "weight"], 3191 ["1", "35213.81"], 3192 ["2", "26226.08"], 3193 ["3", "17403.31"], 3194 ["4", "2671.04"], 3195 ["5", "679.29"], 3196 ["6", "653.84"], 3197 ["7", "354.56"], 3198 ["8", "31.85"], 3199 ["9", "8.23"], 3200 ["10", "1.79"]]; 3201 string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv"); 3202 writeUnittestTsvFile(fpath_data2x10e, data2x10e); 3203 3204 string[][] data2x10eExpectedPermuteWt2Probs = 3205 [["random_value", "line", "weight"], 3206 ["0.99998493348975237", "4", "2671.04"], 3207 ["0.99995934807202624", "3", "17403.31"], 3208 ["0.99992995739727453", "2", "26226.08"], 3209 ["0.99987185679245649", "1", "35213.81"], 3210 ["0.99957451563173938", "6", "653.84"], 3211 ["0.99907273650209583", "8", "31.85"], 3212 ["0.99905260312968946", "5", "679.29"], 3213 ["0.99730333650516401", "7", "354.56"], 3214 ["0.84093902435227808", "9", "8.23"], 3215 ["0.65650015926290028", "10", "1.79"]]; 3216 3217 /* Data sets for distinct sampling. */ 3218 string[][] data5x25 = 3219 [["ID", "Shape", "Color", "Size", "Weight"], 3220 ["01", "circle", "red", "S", "10"], 3221 ["02", "circle", "black", "L", "20"], 3222 ["03", "square", "black", "L", "20"], 3223 ["04", "circle", "green", "L", "30"], 3224 ["05", "ellipse", "red", "S", "20"], 3225 ["06", "triangle", "red", "S", "10"], 3226 ["07", "triangle", "red", "L", "20"], 3227 ["08", "square", "black", "S", "10"], 3228 ["09", "circle", "black", "S", "20"], 3229 ["10", "square", "green", "L", "20"], 3230 ["11", "triangle", "red", "L", "20"], 3231 ["12", "circle", "green", "L", "30"], 3232 ["13", "ellipse", "red", "S", "20"], 3233 ["14", "circle", "green", "L", "30"], 3234 ["15", "ellipse", "red", "L", "30"], 3235 ["16", "square", "red", "S", "10"], 3236 ["17", "circle", "black", "L", "20"], 3237 ["18", "square", "red", "S", "20"], 3238 ["19", "square", "black", "L", "20"], 3239 ["20", "circle", "red", "S", "10"], 3240 ["21", "ellipse", "black", "L", "30"], 3241 ["22", "triangle", 
"red", "L", "30"], 3242 ["23", "circle", "green", "S", "20"], 3243 ["24", "square", "green", "L", "20"], 3244 ["25", "circle", "red", "S", "10"], 3245 ]; 3246 3247 string fpath_data5x25 = buildPath(testDir, "data5x25.tsv"); 3248 string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv"); 3249 writeUnittestTsvFile(fpath_data5x25, data5x25); 3250 writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1 .. $]); 3251 3252 string[][] data5x25ExpectedDistinctK2P40 = 3253 [["ID", "Shape", "Color", "Size", "Weight"], 3254 ["03", "square", "black", "L", "20"], 3255 ["05", "ellipse", "red", "S", "20"], 3256 ["08", "square", "black", "S", "10"], 3257 ["10", "square", "green", "L", "20"], 3258 ["13", "ellipse", "red", "S", "20"], 3259 ["15", "ellipse", "red", "L", "30"], 3260 ["16", "square", "red", "S", "10"], 3261 ["18", "square", "red", "S", "20"], 3262 ["19", "square", "black", "L", "20"], 3263 ["21", "ellipse", "black", "L", "30"], 3264 ["24", "square", "green", "L", "20"], 3265 ]; 3266 3267 string[][] data5x25ExpectedDistinctK2K4P20 = 3268 [["ID", "Shape", "Color", "Size", "Weight"], 3269 ["03", "square", "black", "L", "20"], 3270 ["07", "triangle", "red", "L", "20"], 3271 ["08", "square", "black", "S", "10"], 3272 ["10", "square", "green", "L", "20"], 3273 ["11", "triangle", "red", "L", "20"], 3274 ["16", "square", "red", "S", "10"], 3275 ["18", "square", "red", "S", "20"], 3276 ["19", "square", "black", "L", "20"], 3277 ["22", "triangle", "red", "L", "30"], 3278 ["24", "square", "green", "L", "20"], 3279 ]; 3280 3281 string[][] data5x25ExpectedDistinctK2K3K4P20 = 3282 [["ID", "Shape", "Color", "Size", "Weight"], 3283 ["04", "circle", "green", "L", "30"], 3284 ["07", "triangle", "red", "L", "20"], 3285 ["09", "circle", "black", "S", "20"], 3286 ["11", "triangle", "red", "L", "20"], 3287 ["12", "circle", "green", "L", "30"], 3288 ["14", "circle", "green", "L", "30"], 3289 ["16", "square", "red", "S", "10"], 3290 ["18", "square", "red", "S", "20"], 3291 
["22", "triangle", "red", "L", "30"], 3292 ]; 3293 3294 /* Fields 2 and 4 from data5x25. Distinct rows should be the same for equiv keys. */ 3295 string[][] data2x25 = 3296 [["Shape", "Size"], 3297 ["circle", "S"], 3298 ["circle", "L"], 3299 ["square", "L"], 3300 ["circle", "L"], 3301 ["ellipse", "S"], 3302 ["triangle", "S"], 3303 ["triangle", "L"], 3304 ["square", "S"], 3305 ["circle", "S"], 3306 ["square", "L"], 3307 ["triangle", "L"], 3308 ["circle", "L"], 3309 ["ellipse", "S"], 3310 ["circle", "L"], 3311 ["ellipse", "L"], 3312 ["square", "S"], 3313 ["circle", "L"], 3314 ["square", "S"], 3315 ["square", "L"], 3316 ["circle", "S"], 3317 ["ellipse", "L"], 3318 ["triangle", "L"], 3319 ["circle", "S"], 3320 ["square", "L"], 3321 ["circle", "S"], 3322 ]; 3323 3324 string fpath_data2x25 = buildPath(testDir, "data2x25.tsv"); 3325 string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv"); 3326 writeUnittestTsvFile(fpath_data2x25, data2x25); 3327 writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1 .. 
$]); 3328 3329 string[][] data2x25ExpectedDistinctK1K2P20 = 3330 [["Shape", "Size"], 3331 ["square", "L"], 3332 ["triangle", "L"], 3333 ["square", "S"], 3334 ["square", "L"], 3335 ["triangle", "L"], 3336 ["square", "S"], 3337 ["square", "S"], 3338 ["square", "L"], 3339 ["triangle", "L"], 3340 ["square", "L"], 3341 ]; 3342 3343 string[][] data1x25 = 3344 [["Shape-Size"], 3345 ["circle-S"], 3346 ["circle-L"], 3347 ["square-L"], 3348 ["circle-L"], 3349 ["ellipse-S"], 3350 ["triangle-S"], 3351 ["triangle-L"], 3352 ["square-S"], 3353 ["circle-S"], 3354 ["square-L"], 3355 ["triangle-L"], 3356 ["circle-L"], 3357 ["ellipse-S"], 3358 ["circle-L"], 3359 ["ellipse-L"], 3360 ["square-S"], 3361 ["circle-L"], 3362 ["square-S"], 3363 ["square-L"], 3364 ["circle-S"], 3365 ["ellipse-L"], 3366 ["triangle-L"], 3367 ["circle-S"], 3368 ["square-L"], 3369 ["circle-S"], 3370 ]; 3371 3372 string fpath_data1x25 = buildPath(testDir, "data1x25.tsv"); 3373 string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv"); 3374 writeUnittestTsvFile(fpath_data1x25, data1x25); 3375 writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1 .. 
$]); 3376 3377 string[][] data1x25ExpectedDistinctK1P20 = 3378 [["Shape-Size"], 3379 ["triangle-L"], 3380 ["square-S"], 3381 ["triangle-L"], 3382 ["ellipse-L"], 3383 ["square-S"], 3384 ["square-S"], 3385 ["ellipse-L"], 3386 ["triangle-L"], 3387 ]; 3388 3389 string[][] data1x25ExpectedDistinctK1P20Probs = 3390 [["random_value", "Shape-Size"], 3391 ["0", "triangle-L"], 3392 ["0", "square-S"], 3393 ["0", "triangle-L"], 3394 ["0", "ellipse-L"], 3395 ["0", "square-S"], 3396 ["0", "square-S"], 3397 ["0", "ellipse-L"], 3398 ["0", "triangle-L"], 3399 ]; 3400 3401 string[][] data1x25ExpectedDistinctK1P20ProbsInorder = 3402 [["random_value", "Shape-Size"], 3403 ["1", "circle-S"], 3404 ["4", "circle-L"], 3405 ["2", "square-L"], 3406 ["4", "circle-L"], 3407 ["2", "ellipse-S"], 3408 ["1", "triangle-S"], 3409 ["0", "triangle-L"], 3410 ["0", "square-S"], 3411 ["1", "circle-S"], 3412 ["2", "square-L"], 3413 ["0", "triangle-L"], 3414 ["4", "circle-L"], 3415 ["2", "ellipse-S"], 3416 ["4", "circle-L"], 3417 ["0", "ellipse-L"], 3418 ["0", "square-S"], 3419 ["4", "circle-L"], 3420 ["0", "square-S"], 3421 ["2", "square-L"], 3422 ["1", "circle-S"], 3423 ["0", "ellipse-L"], 3424 ["0", "triangle-L"], 3425 ["1", "circle-S"], 3426 ["2", "square-L"], 3427 ["1", "circle-S"], 3428 ]; 3429 3430 /* 3431 * Enough setup! Actually run some tests! 3432 */ 3433 3434 /* Shuffling tests. Headers, static seed, compatibility mode. With weights and without. 
*/ 3435 testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty); 3436 testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0); 3437 testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1); 3438 testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat); 3439 testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat); 3440 testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat); 3441 testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 3442 testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 3443 testTsvSample(["test-a8b", "-H", "-s", "--weight-field", "field_c", fpath_data3x6], data3x6ExpectedPermuteWt3); 3444 testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3445 testTsvSample(["test-a9b", "-H", "-s", "--print-random", "-w", "field_c", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3446 testTsvSample(["test-a9c", "-H", "-s", "--print-random", "-w", "f*c", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3447 testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3448 testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3449 testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 3450 testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); 3451 testTsvSample(["test-a13b", "-H", "-v", "41", "-w", "field_c", "--print-random", fpath_data3x6], 
data3x6ExpectedPermuteWt3V41Probs); 3452 testTsvSample(["test-a13c", "--line-buffered", "-H", "-v", "41", "-w", "field_c", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); 3453 3454 /* Shuffling, without compatibility mode, or with both compatibility and printing. */ 3455 testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty); 3456 testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0); 3457 testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1); 3458 testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle); 3459 testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap); 3460 testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap); 3461 testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 3462 testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3463 testTsvSample(["test-aa8b", "-H", "-s", "--print-random", "-w", "field_c", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3464 testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3465 3466 /* Reservoir sampling using Algorithm R. 3467 * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.) 
3468 */ 3469 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 3470 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 3471 testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0); 3472 testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0); 3473 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1); 3474 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1); 3475 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6); 3476 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6); 3477 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5); 3478 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4); 3479 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3); 3480 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2); 3481 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1); 3482 testTsvSample(["test-aa22b", "--line-buffered", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1); 3483 3484 /* Inorder versions of Algorithm R tests. 
*/ 3485 testTsvSample(["test-ai10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty); 3486 testTsvSample(["test-ai11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty); 3487 testTsvSample(["test-ai12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0); 3488 testTsvSample(["test-ai13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0); 3489 testTsvSample(["test-ai14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1); 3490 testTsvSample(["test-ai15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1); 3491 testTsvSample(["test-ai16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder); 3492 testTsvSample(["test-ai17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder); 3493 testTsvSample(["test-ai18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5Inorder); 3494 testTsvSample(["test-ai19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4Inorder); 3495 testTsvSample(["test-ai20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3Inorder); 3496 testTsvSample(["test-ai21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2Inorder); 3497 testTsvSample(["test-ai22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1Inorder); 3498 3499 /* Bernoulli sampling cases. 
*/ 3500 testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty); 3501 testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0); 3502 testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1); 3503 testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6); 3504 testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6); 3505 testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3506 testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60); 3507 testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60); 3508 testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs); 3509 testTsvSample(["test-a22b", "--line-buffered", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs); 3510 3511 /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. 
*/ 3512 testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01); 3513 testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02); 3514 testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03); 3515 testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01); 3516 testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02); 3517 testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03); 3518 testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40); 3519 testTsvSample(["test-ab7b", "--line-buffered", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40); 3520 3521 /* Distinct sampling cases. 
*/ 3522 testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty); 3523 testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0); 3524 testTsvSample(["test-a24b", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "field_a", fpath_data3x0], data3x0); 3525 testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1); 3526 testTsvSample(["test-a25b", "-H", "-s", "-p", "1.0", "-k", "field_b", fpath_data3x1], data3x1); 3527 testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6); 3528 testTsvSample(["test-a26b", "-H", "-s", "-p", "1.0", "-k", "field_b", fpath_data3x6], data3x6); 3529 testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 3530 testTsvSample(["test-a27b", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 3531 testTsvSample(["test-a27c", "--line-buffered", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 3532 3533 /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling. 3534 * For weighted sampling, use the weighted cases, but with expected using the original ordering. 
3535 */ 3536 testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3537 testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3538 testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 3539 data3x6ExpectedWt3ProbsInorder); 3540 testTsvSample(["test-a30b", "-H", "-s", "--gen-random-inorder", "--weight-field", "field_c", fpath_data3x6], 3541 data3x6ExpectedWt3ProbsInorder); 3542 testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 3543 data3x6ExpectedWt3V41ProbsInorder); 3544 testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6], 3545 data3x6ExpectedDistinctK1K3P60Probs); 3546 testTsvSample(["test-a32b", "-H", "-s", "-p", "0.6", "-k", "field_a,field_c", "--print-random", fpath_data3x6], 3547 data3x6ExpectedDistinctK1K3P60Probs); 3548 testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header", 3549 "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom); 3550 testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6], 3551 data3x6ExpectedDistinctK2P2ProbsInorder); 3552 testTsvSample(["test-a34b", "--line-buffered", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6], 3553 data3x6ExpectedDistinctK2P2ProbsInorder); 3554 3555 /* Simple random sampling with replacement. 
*/ 3556 testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty); 3557 testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty); 3558 testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0); 3559 testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0); 3560 testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3); 3561 testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10); 3562 testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77); 3563 testTsvSample(["test-a41b", "--line-buffered", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77); 3564 3565 /* Shuffling, compatibility mode, without headers. */ 3566 testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1 .. $]); 3567 testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1 .. $]); 3568 testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1 .. $]); 3569 testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1 .. $]); 3570 testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1 .. $]); 3571 testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]); 3572 testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]); 3573 testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. 
$]); 3574 testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. $]); 3575 testTsvSample(["test-b9b", "--line-buffered", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. $]); 3576 3577 /* Shuffling, no headers, without compatibility mode, or with printing and compatibility mode. */ 3578 testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1 .. $]); 3579 testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1 .. $]); 3580 testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1 .. $]); 3581 testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1 .. $]); 3582 testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]); 3583 testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]); 3584 testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]); 3585 testTsvSample(["test-bb7b", "--line-buffered", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]); 3586 3587 /* Reservoir sampling using Algorithm R, no headers. */ 3588 testTsvSample(["test-ac10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 3589 testTsvSample(["test-ac11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 3590 testTsvSample(["test-ac14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1 .. $]); 3591 testTsvSample(["test-ac15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1 .. 
$]); 3592 testTsvSample(["test-ac16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]); 3593 testTsvSample(["test-ac17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]); 3594 testTsvSample(["test-ac18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5[1 .. $]); 3595 testTsvSample(["test-ac19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4[1 .. $]); 3596 testTsvSample(["test-ac20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3[1 .. $]); 3597 testTsvSample(["test-ac21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2[1 .. $]); 3598 testTsvSample(["test-ac22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]); 3599 testTsvSample(["test-ac22b", "--line-buffered", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]); 3600 3601 /* Reservoir sampling using Algorithm R, no headers, inorder output. */ 3602 testTsvSample(["test-aj10", "--prefer-algorithm-r", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty); 3603 testTsvSample(["test-aj11", "--prefer-algorithm-r", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty); 3604 testTsvSample(["test-aj14", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3605 testTsvSample(["test-aj15", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3606 testTsvSample(["test-aj16", "--prefer-algorithm-r", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. 
$]); 3607 testTsvSample(["test-aj17", "--prefer-algorithm-r", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]); 3608 testTsvSample(["test-aj18", "--prefer-algorithm-r", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5Inorder[1 .. $]); 3609 testTsvSample(["test-aj19", "--prefer-algorithm-r", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4Inorder[1 .. $]); 3610 testTsvSample(["test-aj20", "--prefer-algorithm-r", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3Inorder[1 .. $]); 3611 testTsvSample(["test-aj21", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2Inorder[1 .. $]); 3612 testTsvSample(["test-aj22", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]); 3613 testTsvSample(["test-aj22b", "--line-buffered", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]); 3614 3615 /* Bernoulli sampling cases. */ 3616 testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1 .. $]); 3617 testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]); 3618 testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]); 3619 testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]); 3620 testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1 .. $]); 3621 testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. 
$]); 3622 testTsvSample(["test-b15b", "--line-buffered", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. $]); 3623 3624 /* Bernoulli sampling with probabilities in skip sampling range. */ 3625 testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1 .. $]); 3626 testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1 .. $]); 3627 testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1 .. $]); 3628 testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1 .. $]); 3629 testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1 .. $]); 3630 testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1 .. $]); 3631 testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. $]); 3632 testTsvSample(["test-bb7b", "--line-buffered", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. $]); 3633 3634 /* Distinct sampling cases. */ 3635 testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1 .. $]); 3636 testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); 3637 testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); 3638 testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. 
$]); 3639 testTsvSample(["test-b19b", "--line-buffered", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); 3640 3641 /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */ 3642 testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]); 3643 testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1 .. $]); 3644 testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader], 3645 data3x6ExpectedDistinctK1K3P60Probs[1 .. $]); 3646 testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader], 3647 data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]); 3648 testTsvSample(["test-b24b", "--line-buffered", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader], 3649 data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]); 3650 3651 /* Simple random sampling with replacement. */ 3652 testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty); 3653 testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty); 3654 testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1 .. $]); 3655 testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1 .. $]); 3656 testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]); 3657 testTsvSample(["test-b29b", "--line-buffered", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]); 3658 3659 /* Multi-file tests. 
*/ 3660 testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode", 3661 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3662 combo1ExpectedPermuteCompat); 3663 testTsvSample(["test-c2", "--header", "--static-seed", "--print-random", 3664 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3665 combo1ExpectedPermuteCompatProbs); 3666 testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3", 3667 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3668 combo1ExpectedPermuteWt3Probs); 3669 testTsvSample(["test-c3b", "--header", "--static-seed", "--print-random", "--weight-field", "field_c", 3670 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3671 combo1ExpectedPermuteWt3Probs); 3672 testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode", 3673 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3674 combo1ExpectedPermuteWt3); 3675 testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", 3676 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3677 combo1ExpectedSampleAlgoRNum4); 3678 testTsvSample(["test-c5b", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder", 3679 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3680 combo1ExpectedSampleAlgoRNum4Inorder); 3681 3682 /* Multi-file, no headers. */ 3683 testTsvSample(["test-c6", "--static-seed", "--compatibility-mode", 3684 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3685 fpath_data3x6_noheader, fpath_data3x2_noheader], 3686 combo1ExpectedPermuteCompat[1 .. 
$]); 3687 testTsvSample(["test-c7", "--static-seed", "--print-random", 3688 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3689 fpath_data3x6_noheader, fpath_data3x2_noheader], 3690 combo1ExpectedPermuteCompatProbs[1 .. $]); 3691 testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3", 3692 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3693 fpath_data3x6_noheader, fpath_data3x2_noheader], 3694 combo1ExpectedPermuteWt3Probs[1 .. $]); 3695 testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode", 3696 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3697 fpath_data3x6_noheader, fpath_data3x2_noheader], 3698 combo1ExpectedPermuteWt3[1 .. $]); 3699 testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4", 3700 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3701 fpath_data3x6_noheader, fpath_data3x2_noheader], 3702 combo1ExpectedSampleAlgoRNum4[1 .. $]); 3703 testTsvSample(["test-c10b", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder", 3704 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3705 fpath_data3x6_noheader, fpath_data3x2_noheader], 3706 combo1ExpectedSampleAlgoRNum4Inorder[1 .. $]); 3707 3708 /* Bernoulli sampling cases. 
*/ 3709 testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5", 3710 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3711 combo1ExpectedBernoulliCompatP50Probs); 3712 testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4", 3713 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3714 combo1ExpectedBernoulliCompatP40); 3715 testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5", 3716 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3717 fpath_data3x6_noheader, fpath_data3x2_noheader], 3718 combo1ExpectedBernoulliCompatP50Probs[1 .. $]); 3719 testTsvSample(["test-c14", "--static-seed", "--prob", ".4", 3720 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3721 fpath_data3x6_noheader, fpath_data3x2_noheader], 3722 combo1ExpectedBernoulliCompatP40[1 .. $]); 3723 testTsvSample(["test-c14b", "--line-buffered", "--static-seed", "--prob", ".4", 3724 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3725 fpath_data3x6_noheader, fpath_data3x2_noheader], 3726 combo1ExpectedBernoulliCompatP40[1 .. $]); 3727 3728 /* Bernoulli sampling with probabilities in skip sampling range. */ 3729 testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03", 3730 fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10], 3731 combo2ExpectedBernoulliSkipV333P03); 3732 testTsvSample(["test-cc2", "-v", "333", "-p", "0.03", 3733 fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader], 3734 combo2ExpectedBernoulliSkipV333P03[1 .. $]); 3735 testTsvSample(["test-cc3", "--line-buffered", "-v", "333", "-p", "0.03", 3736 fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader], 3737 combo2ExpectedBernoulliSkipV333P03[1 .. $]); 3738 3739 /* Distinct sampling cases. 
*/ 3740 testTsvSample(["test-c15", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4", 3741 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3742 combo1ExpectedDistinctK1P40); 3743 testTsvSample(["test-c15b", "--header", "--static-seed", "--key-fields", "field_a", "--prob", ".4", 3744 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3745 combo1ExpectedDistinctK1P40); 3746 testTsvSample(["test-c15c", "--line-buffered", "--header", "--static-seed", "--key-fields", "field_a", "--prob", ".4", 3747 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3748 combo1ExpectedDistinctK1P40); 3749 testTsvSample(["test-c16", "--static-seed", "--key-fields", "1", "--prob", ".4", 3750 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3751 fpath_data3x6_noheader, fpath_data3x2_noheader], 3752 combo1ExpectedDistinctK1P40[1 .. $]); 3753 testTsvSample(["test-c16b", "--line-buffered", "--static-seed", "--key-fields", "1", "--prob", ".4", 3754 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3755 fpath_data3x6_noheader, fpath_data3x2_noheader], 3756 combo1ExpectedDistinctK1P40[1 .. $]); 3757 3758 /* Generating random weights. */ 3759 testTsvSample(["test-c17", "--header", "--static-seed", "--gen-random-inorder", 3760 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3761 combo1ExpectedProbsInorder); 3762 testTsvSample(["test-c18", "--static-seed", "--gen-random-inorder", 3763 fpath_data3x3_noheader, fpath_data3x1_noheader, 3764 fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader], 3765 combo1ExpectedProbsInorder[1 .. $]); 3766 3767 /* Simple random sampling with replacement. 
*/
    testTsvSample(["test-c19", "--header", "--static-seed", "--replace", "--num", "10",
                   fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
                  combo1ExpectedReplaceNum10);

    testTsvSample(["test-c20", "--static-seed", "--replace", "--num", "10",
                   fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
                   fpath_data3x6_noheader, fpath_data3x2_noheader],
                  combo1ExpectedReplaceNum10[1 .. $]);

    /* Single column file. */
    /* NOTE(review): test-d2 is byte-identical to test-d1 (same flags, same input
     * file, same expected output), so it adds no coverage. It was presumably
     * intended to exercise a different flag or data file — confirm against the
     * original test plan before changing. Left as-is here.
     */
    testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);
    testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat);

    /* Distributions. */
    testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
    testTsvSample(["test-e1b", "-H", "-s", "-w", "weight", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs);
    testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs);
    testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs);
    testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs);
    testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs);

    /* Tests of the subset sample (--n|num) option. Random sampling, Bernoulli
     * sampling, distinct sampling.
     *
     * Note: The way these tests are done ensures that subset length does not
     * affect output order.
     */
    import std.algorithm : min;
    for (size_t n = data3x6.length + 2; n >= 1; n--)
    {
        /* reservoirSamplingViaHeap.
*/
        size_t expectedLength = min(data3x6.length, n + 1);
        testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
                       "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);

        testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
                       "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]);

        testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
                       "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]);

        testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
                       "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]);

        testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
                       "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]);

        testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
                       fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]);

        testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
                       "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]);

        testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
                       "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]);

        testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string,
                       "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]);

        /* Bernoulli sampling.
*/
        /* Note: the redundant `import std.algorithm : min;` that previously sat
         * here has been removed — `min` is already imported at function scope
         * just above the enclosing loop.
         */
        size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length);

        testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
                       "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]);

        testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
                       "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]);

        testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
                       "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]);

        testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
                       fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]);

        /* Distinct Sampling.
         */
        size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length);

        testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
                       "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]);

        testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
                       fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]);

        testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
                       "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]);

        testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
                       fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
    }

    /* Similar tests with the 1x10 data set.
*/
    for (size_t n = data1x10.length + 2; n >= 1; n--)
    {
        size_t expectedLength = min(data1x10.length, n + 1);
        testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
                       "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]);

        testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
                       "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]);

        testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
                       fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]);

        testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
                       "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]);
    }

    /* Simple random sampling with replacement: ensure sample size doesn't change order. */
    for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--)
    {
        testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
                      data3x6ExpectedReplaceNum10[0 .. n + 1]);

        testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
                      data3x6ExpectedReplaceNum10[1 .. n + 1]);
    }

    /* Bernoulli skip sampling. Test with lengths both greater than and less than expected.
*/
    for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--)
    {
        size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1);

        testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
                       "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]);

        testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string,
                       fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]);
    }

    /* Inorder sampling tests using reservoir sampling via heap (compatibility mode). */
    testTsvSample(["test-ar10", "--compatibility-mode", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty);
    testTsvSample(["test-ar11", "--compatibility-mode", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty);
    testTsvSample(["test-ar12", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0);
    testTsvSample(["test-ar13", "--compatibility-mode", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0);
    testTsvSample(["test-ar14", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1);
    testTsvSample(["test-ar15", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1);
    testTsvSample(["test-ar16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
    testTsvSample(["test-ar17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder);
    testTsvSample(["test-ar18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum5Inorder);
    testTsvSample(["test-ar19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum4Inorder);
    testTsvSample(["test-ar20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum3Inorder);
    testTsvSample(["test-ar21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum2Inorder);
    testTsvSample(["test-ar22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum1Inorder);

    testTsvSample(["test-as10", "--compatibility-mode", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty);
    testTsvSample(["test-as11", "--compatibility-mode", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty);
    testTsvSample(["test-as14", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
    testTsvSample(["test-as15", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]);
    testTsvSample(["test-as16", "--compatibility-mode", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
    testTsvSample(["test-as17", "--compatibility-mode", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]);
    testTsvSample(["test-as18", "--compatibility-mode", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5Inorder[1 .. $]);
    testTsvSample(["test-as19", "--compatibility-mode", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4Inorder[1 .. $]);
    testTsvSample(["test-as20", "--compatibility-mode", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3Inorder[1 .. $]);
    testTsvSample(["test-as21", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2Inorder[1 .. $]);
    testTsvSample(["test-as22", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1Inorder[1 .. $]);

    /* Inorder sampling tests with random number printing. --compatibility-mode not needed.
     *
     * Note: the "-b" variants (test-at19b, test-at20b, test-au19b) repeat the
     * corresponding test without --compatibility-mode. They previously reused the
     * same test ID, which made failure diagnostics ambiguous; the IDs are now unique.
     */
    testTsvSample(["test-at16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
    testTsvSample(["test-at17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
    testTsvSample(["test-at18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum5ProbsInorder);
    testTsvSample(["test-at19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
    testTsvSample(["test-at19b", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
    testTsvSample(["test-at20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
    testTsvSample(["test-at20b", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
    testTsvSample(["test-at21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum2ProbsInorder);
    testTsvSample(["test-at22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum1ProbsInorder);

    testTsvSample(["test-au16", "--compatibility-mode", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
    testTsvSample(["test-au17", "--compatibility-mode", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
    testTsvSample(["test-au18", "--compatibility-mode", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5ProbsInorder[1 .. $]);
    testTsvSample(["test-au19", "--compatibility-mode", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
    testTsvSample(["test-au19b", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
    testTsvSample(["test-au20", "--compatibility-mode", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3ProbsInorder[1 .. $]);
    testTsvSample(["test-au21", "--compatibility-mode", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2ProbsInorder[1 .. $]);
    testTsvSample(["test-au22", "--compatibility-mode", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1ProbsInorder[1 .. $]);

    /* Inorder weighted sampling tests.
*/
    /* NOTE(review): these expectations are the `Wt3` (weight field 3) expected
     * arrays, but no "-w"/"3" argument appears in the invocations as captured
     * here — presumably "-w", "3" belongs in each argument list; verify against
     * the upstream source. Invocations left exactly as found.
     */
    testTsvSample(["test-ax16", "-H", "-s", "-n", "7", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
    testTsvSample(["test-ax17", "-H", "-s", "-n", "6", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder);
    testTsvSample(["test-ax18", "-H", "-s", "-n", "5", "-i", fpath_data3x6], data3x6ExpectedWt3Num5Inorder);
    testTsvSample(["test-ax19", "-H", "-s", "-n", "4", "-i", fpath_data3x6], data3x6ExpectedWt3Num4Inorder);
    testTsvSample(["test-ax20", "-H", "-s", "-n", "3", "-i", fpath_data3x6], data3x6ExpectedWt3Num3Inorder);
    testTsvSample(["test-ax21", "-H", "-s", "-n", "2", "-i", fpath_data3x6], data3x6ExpectedWt3Num2Inorder);
    testTsvSample(["test-ax22", "-H", "-s", "-n", "1", "-i", fpath_data3x6], data3x6ExpectedWt3Num1Inorder);

    testTsvSample(["test-ay16", "-s", "-n", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
    testTsvSample(["test-ay17", "-s", "-n", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]);
    testTsvSample(["test-ay18", "-s", "-n", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num5Inorder[1 .. $]);
    testTsvSample(["test-ay19", "-s", "-n", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num4Inorder[1 .. $]);
    testTsvSample(["test-ay20", "-s", "-n", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num3Inorder[1 .. $]);
    testTsvSample(["test-ay21", "-s", "-n", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num2Inorder[1 .. $]);
    testTsvSample(["test-ay22", "-s", "-n", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num1Inorder[1 .. $]);

    /*
     * Distinct sampling tests.
*/
    testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
                  data5x25ExpectedDistinctK2P40);

    testTsvSample(["test-j1b", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "Shape", fpath_data5x25],
                  data5x25ExpectedDistinctK2P40);

    testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
                  data5x25ExpectedDistinctK2K4P20);

    testTsvSample(["test-j2b", "-H", "-s", "-p", "0.20", "-k", "Shape,Size", fpath_data5x25],
                  data5x25ExpectedDistinctK2K4P20);

    testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
                  data5x25ExpectedDistinctK2K3K4P20);

    testTsvSample(["test-j3b", "-H", "-s", "-p", "0.20", "-k", "Shape-Size", fpath_data5x25],
                  data5x25ExpectedDistinctK2K3K4P20);

    testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
                  data5x25ExpectedDistinctK2P40[1 .. $]);

    testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
                  data5x25ExpectedDistinctK2K4P20[1 .. $]);

    testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
                  data5x25ExpectedDistinctK2K3K4P20[1 .. $]);


    /* These distinct tests check that the whole line as '-k 0' and specifying all fields
     * in order have the same result. Also that field numbers don't matter, as '-k 1,2'
     * in data2x25 are the same keys as '-k 2,4' in data5x25.
*/
    testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25],
                  data2x25ExpectedDistinctK1K2P20);

    testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25],
                  data2x25ExpectedDistinctK1K2P20);

    testTsvSample(["test-j8b", "-H", "-s", "-p", "0.20", "-k", "*", fpath_data2x25],
                  data2x25ExpectedDistinctK1K2P20);

    testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader],
                  data2x25ExpectedDistinctK1K2P20[1 .. $]);

    testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader],
                  data2x25ExpectedDistinctK1K2P20[1 .. $]);

    /* Similar to the last set, but for a 1-column file. Also with random value printing. */
    testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25],
                  data1x25ExpectedDistinctK1P20);

    testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25],
                  data1x25ExpectedDistinctK1P20);

    testTsvSample(["test-j12b", "-H", "-s", "-p", "0.20", "-k", "*", fpath_data1x25],
                  data1x25ExpectedDistinctK1P20);

    testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader],
                  data1x25ExpectedDistinctK1P20[1 .. $]);

    testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader],
                  data1x25ExpectedDistinctK1P20[1 .. $]);

    testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25],
                  data1x25ExpectedDistinctK1P20Probs);

    testTsvSample(["test-j15b", "-H", "-s", "-p", "0.20", "-k", `Shape\-Size`, "--print-random", fpath_data1x25],
                  data1x25ExpectedDistinctK1P20Probs);

    testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25],
                  data1x25ExpectedDistinctK1P20Probs);

    testTsvSample(["test-j16b", "-H", "-s", "-p", "0.20", "-k", "*", "--print-random", fpath_data1x25],
                  data1x25ExpectedDistinctK1P20Probs);

    testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader],
                  data1x25ExpectedDistinctK1P20Probs[1 .. $]);

    testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader],
                  data1x25ExpectedDistinctK1P20Probs[1 .. $]);

    testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25],
                  data1x25ExpectedDistinctK1P20ProbsInorder);

    testTsvSample(["test-j19b", "-H", "-s", "-p", "0.20", "-k", `Shape\-Size`, "--gen-random-inorder", fpath_data1x25],
                  data1x25ExpectedDistinctK1P20ProbsInorder);

    testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25],
                  data1x25ExpectedDistinctK1P20ProbsInorder);

    testTsvSample(["test-j20b", "-H", "-s", "-p", "0.20", "-k", "*", "--gen-random-inorder", fpath_data1x25],
                  data1x25ExpectedDistinctK1P20ProbsInorder);

    testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader],
                  data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);

    testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader],
                  data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]);

}