/**
Command line tool for splitting a file (or files) into multiple output files.
Several methods for splitting are available, including splitting by line count,
splitting by random assignment, and splitting by random assignment based on
key fields.

Copyright (c) 2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_split;

import std.exception : enforce;
import std.format : format;
import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program.
     *
     * Runs command line argument processing, then hands control to tsvSplit to do
     * the real work. Any exception raised during processing is caught here and
     * reported to the user on stderr.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSplitOptions cmdopt;
        const argsResult = cmdopt.processArgs(cmdArgs);
        if (!argsResult[0]) return argsResult[1];

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        try
        {
            tsvSplit(cmdopt);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}

immutable helpText = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

By default, files are written to the current directory and have names
of the form 'part_NNN<suffix>', with 'NNN' being a number and <suffix>
being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The output
directory and file names are customizable.

Fields are specified using field number or field name. Field names
require that the input file has a header line.

Use '--help-verbose' for more detailed information.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

Output files: By default, files are written to the current directory and
have names of the form 'part_NNN<suffix>', with 'NNN' being a number and
<suffix> being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The suffix is
empty when reading from standard input. The numeric part defaults to 3
digits for '--l|lines-per-files'. For '--n|num-files' enough digits are
used so all filenames are the same length. The output directory and file
names are customizable.

Header lines: There are two ways to handle input with headers: write a
header to all output files (--H|header), or exclude headers from all
output files ('--I|header-in-only'). The best choice depends on the
follow-up processing. All tsv-utils tools support header lines in multiple
input files, but many other tools do not. For example, GNU parallel works
best on files without header lines.

Random assignment (--n|num-files): Random distribution of records to a set
of files is a common task. When data fits in memory the preferred approach
is usually to shuffle the data and split it into fixed sized blocks. E.g.
'tsv-sample data.tsv | tsv-split -l NUM'. However, alternate approaches
are needed when data is too large for convenient shuffling. tsv-split's
random assignment feature is useful in this case. Each input line is
written to a randomly selected output file. Note that output files will
have similar but not identical numbers of records.

Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): This
splits a data set into multiple files sharded by key. All lines with the
same key are written to the same file. This partitioning enables parallel
computation based on the key. For example, statistical calculation
('tsv-summarize --group-by') or duplicate removal ('tsv-uniq --fields').
These operations can be parallelized using tools like GNU parallel, which
simplifies concurrent operations on multiple files. Fields are specified
using field number or field name. Field names require that the input file
has a header line. Use '--help-fields' for details about field names.

Random seed: By default, each tsv-split invocation using random assignment
or random assignment by key produces different assignments to the output
files. Using '--s|static-seed' changes this so multiple runs produce the
same assignments. This works by using the same random seed each run. The
seed can be specified using '--v|seed-value'.

Appending to existing files: By default, an error is triggered if an
output file already exists. '--a|append' changes this so that lines are
appended to existing files. (Header lines are not appended to files with
data.) This is useful when adding new data to files created by a previous
tsv-split run. Random assignment should use the same '--n|num-files' value
each run, but different random seeds (avoid '--s|static-seed'). Random
assignment by key should use the same '--n|num-files', '--k|key-fields',
and seed ('--s|static-seed' or '--v|seed-value') each run.

Max number of open files: Random assignment and random assignment by key
are dramatically faster when all output files are kept open. However,
keeping a large numbers of open files can bump into system limits or limit
resources available to other processes. By default, tsv-split uses up to
4096 open files or the system per-process limit, whichever is smaller.
This can be changed using '--max-open-files', though it cannot be set
larger than the system limit. The system limit varies considerably between
systems. On many systems it is unlimited. On MacOS it is often set to 256.
Use Unix 'ulimit' to display and modify the limits:
* 'ulimit -n' - Show the "soft limit". The per-process maximum.
* 'ulimit -Hn' - Show the "hard limit". The max allowed soft limit.
* 'ulimit -Sn NUM' - Change the "soft limit" to NUM.

Examples:

   # Split a 10 million line file into 1000 files, 10,000 lines each.
   # Output files are part_000.txt, part_001.txt, ... part_999.txt.
   tsv-split data.txt --lines-per-file 10000

   # Same as the previous example, but write files to a subdirectory.
   tsv-split data.txt --dir split_files --lines-per-file 10000

   # Split a file into 10,000 line files, writing a header line to each
   tsv-split data.txt -H --lines-per-file 10000

   # Same as the previous example, but dropping the header line.
   tsv-split data.txt -I --lines-per-file 10000

   # Randomly assign lines to 1000 files
   tsv-split data.txt --num-files 1000

   # Randomly assign lines to 1000 files while keeping unique entries
   # from the 'url' field together.
   tsv-split data.tsv -H -k url --num-files 1000

   # Randomly assign lines to 1000 files. Later, randomly assign lines
   # from a second data file to the same output files.
   tsv-split data1.tsv -n 1000
   tsv-split data2.tsv -n 1000 --append

   # Randomly assign lines to 1000 files using field 3 as a key.
   # Later, add a second file to the same output files.
   tsv-split data1.tsv -n 1000 -k 3 --static-seed
   tsv-split data2.tsv -n 1000 -k 3 --static-seed --append

   # Change the system per-process open file limit for one command.
   # The parens create a sub-shell. The current shell is not changed.
   ( ulimit -Sn 1000 && tsv-split --num-files 1000 data.txt )

Options:
EOS";

/** Container for command line options and derived data.
 *
 * TsvSplitOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
219 * 220 * Once argument processing is complete, TsvSplitOptions is used as a container 221 * holding the specific processing options used by the splitting algorithms. 222 */ 223 struct TsvSplitOptions 224 { 225 import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; 226 227 enum invalidFileSuffix = "///////"; 228 229 string programName; /// Program name 230 InputSourceRange inputSources; /// Input files 231 bool headerInOut = false; /// --H|header 232 bool headerIn = false; /// --I|header-in-only 233 size_t linesPerFile = 0; /// --l|lines-per-file 234 uint numFiles = 0; /// --n|num-files 235 size_t[] keyFields; /// Derived: --k|key-fields 236 string dir; /// --dir 237 string prefix = "part_"; /// --prefix 238 string suffix = invalidFileSuffix; /// --suffix 239 uint digitWidth = 0; /// --w|digit-width 240 bool appendToExistingFiles = false; /// --a|append 241 bool staticSeed = false; /// --s|static-seed 242 uint seedValueOptionArg = 0; /// --v|seed-value 243 char delim = '\t'; /// --d|delimiter 244 uint maxOpenFilesArg = 0; /// --max-open-files 245 bool hasHeader = false; /// Derived. True if either '--H|header' or '--I|header-in-only' is set. 246 bool keyIsFullLine = false; /// Derived. True if '--f|fields 0' is specfied. 247 bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value 248 uint seed = 0; /// Derived from --static-seed, --seed-value 249 uint maxOpenOutputFiles; /// Derived. 250 251 /** Process tsv-split command line arguments. 252 * 253 * Defines the command line options, performs validation, and derives additional 254 * state. std.getopt.getopt is called to do the main option processing followed 255 * additional validation and derivation. 256 * 257 * Help text is printed to standard output if help was requested. Error text is 258 * written to stderr if invalid input is encountered. 259 * 260 * A tuple is returned. 
First value is true if command line arguments were 261 * successfully processed and execution should continue, or false if an error 262 * occurred or the user asked for help. If false, the second value is the 263 * appropriate exit code (0 or 1). 264 * 265 * Returning true (execution continues) means args have been validated and derived 266 * values calculated. Field indices will have been converted to zero-based. 267 */ 268 auto processArgs(ref string[] cmdArgs) 269 { 270 import std.algorithm : all, canFind, each, min; 271 import std.conv : to; 272 import std.file : exists, isDir; 273 import std.getopt; 274 import std.math : isNaN; 275 import std.path : baseName, expandTilde, extension, stripExtension; 276 import std.typecons : Yes, No; 277 import tsv_utils.common.fieldlist; 278 279 bool helpVerbose = false; // --help-verbose 280 bool helpFields = false; // --help-fields 281 bool versionWanted = false; // --V|version 282 string keyFieldsArg; // --k|key-fields 283 284 string keyFieldsOptionString = "k|key-fields"; 285 286 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 287 288 try 289 { 290 arraySep = ","; // Use comma to separate values in command line options 291 auto r = getopt( 292 cmdArgs, 293 "help-verbose", " Print more detailed help.", &helpVerbose, 294 "help-fields", " Print help on specifying fields.", &helpFields, 295 296 std.getopt.config.caseSensitive, 297 "H|header", " Input files have a header line. Write the header to each output file.", &headerInOut, 298 "I|header-in-only", " Input files have a header line. Do not write the header to output files.", &headerIn, 299 std.getopt.config.caseInsensitive, 300 301 "l|lines-per-file", "NUM Number of lines to write to each output file (excluding the header line).", &linesPerFile, 302 "n|num-files", "NUM Number of output files to generate.", &numFiles, 303 304 keyFieldsOptionString, 305 "<field-list> Fields to use as key. 
Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.", 306 &keyFieldsArg, 307 308 "dir", "STR Directory to write to. Default: Current working directory.", &dir, 309 "prefix", "STR Filename prefix. Default: 'part_'", &prefix, 310 "suffix", "STR Filename suffix. Default: First input file extension. None for standard input.", &suffix, 311 "w|digit-width", "NUM Number of digits in filename numeric portion. Default: '--l|lines-per-file': 3. '--n|num-files': Chosen so filenames have the same length. '--w|digit-width 0' uses the default.", &digitWidth, 312 "a|append", " Append to existing files.", &appendToExistingFiles, 313 314 "s|static-seed", " Use the same random seed every run.", &staticSeed, 315 316 std.getopt.config.caseSensitive, 317 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, 318 std.getopt.config.caseInsensitive, 319 320 "d|delimiter", "CHR Field delimiter.", &delim, 321 "max-open-files", "NUM Maximum open file handles to use. Min of 5 required.", &maxOpenFilesArg, 322 323 std.getopt.config.caseSensitive, 324 "V|version", " Print version information and exit.", &versionWanted, 325 std.getopt.config.caseInsensitive, 326 ); 327 328 if (r.helpWanted) 329 { 330 defaultGetoptPrinter(helpText, r.options); 331 return tuple(false, 0); 332 } 333 else if (helpVerbose) 334 { 335 defaultGetoptPrinter(helpTextVerbose, r.options); 336 return tuple(false, 0); 337 } 338 else if (helpFields) 339 { 340 writeln(fieldListHelpText); 341 return tuple(false, 0); 342 } 343 else if (versionWanted) 344 { 345 import tsv_utils.common.tsvutils_version; 346 writeln(tsvutilsVersionNotice("tsv-split")); 347 return tuple(false, 0); 348 } 349 350 /* Remaining command line args are files. 351 */ 352 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. 
$] : ["-"]; 353 cmdArgs.length = 1; 354 355 /* Validation and derivations - Do as much validation prior to header line 356 * processing as possible (avoids waiting on stdin). 357 * 358 * Note: keyFields depends on header line processing, but keyFieldsArg 359 * can be used to detect whether the command line argument was specified. 360 */ 361 362 enforce(!(headerInOut && headerIn), 363 "Use only one of '--H|header' and '--I|header-in-only'."); 364 365 hasHeader = headerInOut || headerIn; 366 367 enforce(linesPerFile != 0 || numFiles != 0, 368 "Either '--l|lines-per-file' or '--n|num-files' is required."); 369 370 enforce(linesPerFile == 0 || numFiles == 0, 371 "'--l|lines-per-file' and '--n|num-files' cannot be used together."); 372 373 enforce(linesPerFile == 0 || keyFieldsArg.length == 0, 374 "'--l|lines-per-file' and '--k|key-fields' cannot be used together."); 375 376 enforce(numFiles != 1, "'--n|num-files must be two or more."); 377 378 if (!dir.empty) 379 { 380 dir = dir.expandTilde; 381 enforce(dir.exists, format("Directory does not exist: --dir '%s'", dir)); 382 enforce(dir.isDir, format("Path is not a directory: --dir '%s'", dir)); 383 } 384 385 /* Seed. */ 386 import std.random : unpredictableSeed; 387 388 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 389 390 if (usingUnpredictableSeed) seed = unpredictableSeed; 391 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 392 else if (staticSeed) seed = 2438424139; 393 else assert(0, "Internal error, invalid seed option states."); 394 395 /* Maximum number of open files. Mainly applies when --num-files is used. 396 * 397 * Derive maxOpenOutputFiles. Inputs: 398 * - Internal default limit: 4096. This is a somewhat conservative setting. 399 * - rlimit open files limit. Defined by '$ ulimit -n'. 400 * - '--max-open-files' (maxOpenFilesArg). This adjusts the internal limit, 401 * but only up to the rlimit value. 
402 * - Four open files are reserved for stdin, stdout, stderr, and one input 403 * file. 404 */ 405 406 immutable uint internalDefaultMaxOpenFiles = 4096; 407 immutable uint numReservedOpenFiles = 4; 408 immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit(); 409 410 enforce(maxOpenFilesArg == 0 || maxOpenFilesArg > numReservedOpenFiles, 411 format("'--max-open-files' must be at least %d.", 412 numReservedOpenFiles + 1)); 413 414 enforce(maxOpenFilesArg <= rlimitOpenFilesLimit, 415 format("'--max-open-files' value (%d) greater current system limit (%d)." ~ 416 "\nRun 'ulimit -n' to see the soft limit." ~ 417 "\nRun 'ulimit -Hn' to see the hard limit." ~ 418 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 419 maxOpenFilesArg, rlimitOpenFilesLimit)); 420 421 enforce(rlimitOpenFilesLimit > numReservedOpenFiles, 422 format("System open file limit too small. Current value: %d. Must be %d or more." ~ 423 "\nRun 'ulimit -n' to see the soft limit." ~ 424 "\nRun 'ulimit -Hn' to see the hard limit." ~ 425 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 426 rlimitOpenFilesLimit, numReservedOpenFiles + 1)); 427 428 immutable uint openFilesLimit = 429 (maxOpenFilesArg != 0) 430 ? maxOpenFilesArg 431 : min(internalDefaultMaxOpenFiles, rlimitOpenFilesLimit); 432 433 assert(openFilesLimit > numReservedOpenFiles); 434 435 maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles; 436 437 /* Suffix - If not provided, use the extension of the first input file. 438 * No suffix if reading from standard input. 439 */ 440 if (suffix == invalidFileSuffix) suffix = filepaths[0].extension; 441 442 /* Ensure forward slash is not included in the filename prefix and suffix. 443 * Forward slash is an invalid Unix filename character. However, open file 444 * calls could match a directory path, resulting in unintended file 445 * creation. 446 * 447 * The other invalid filename character on Unix is the NULL character. 
448 * However, the NULL character cannot be entered via Unix command lines, 449 * so there is no need to test for it explicitly. 450 */ 451 enforce(!prefix.canFind('/'), 452 "'--prefix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 453 454 enforce(!suffix.canFind('/'), 455 "'--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 456 457 /* Digit width - If not specified, or specified as zero, the width is 458 * determined by the number of files for --num-files, or defaulted to 3 459 * for --lines-per-file. 460 */ 461 if (digitWidth == 0) 462 { 463 if (numFiles > 0) 464 { 465 digitWidth = 1; 466 uint n = numFiles - 1; 467 while (n >= 10) 468 { 469 n /= 10; 470 ++digitWidth; 471 } 472 } 473 else 474 { 475 digitWidth = 3; 476 } 477 } 478 assert(digitWidth != 0); 479 480 /* 481 * Create the inputSourceRange and perform header line processing. 482 */ 483 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 484 inputSources = inputSourceRange(filepaths, readHeader); 485 486 string[] headerFields; 487 488 if (hasHeader) headerFields = inputSources.front.header.split(delim).to!(string[]); 489 490 if (!keyFieldsArg.empty) 491 { 492 keyFields = 493 keyFieldsArg 494 .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) 495 (hasHeader, headerFields, keyFieldsOptionString) 496 .array; 497 } 498 499 if (keyFields.length > 0) 500 { 501 if (keyFields.length == 1 && keyFields[0] == 0) 502 { 503 keyIsFullLine = true; 504 } 505 else 506 { 507 enforce(keyFields.all!(x => x != 0), 508 "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); 509 510 keyFields.each!((ref x) => --x); // Convert to zero-based indexing. 
511 } 512 } 513 514 } 515 catch (Exception exc) 516 { 517 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 518 return tuple(false, 1); 519 } 520 return tuple(true, 0); 521 } 522 } 523 524 /* TsvSplitOptions unit tests (command-line argument processing). 525 * 526 * Basic tests. Many cases are covered in executable tests, including all error cases, 527 * as errors write to stderr. 528 */ 529 unittest 530 { 531 import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. 532 import std.conv : to; 533 import std.file : mkdir, rmdirRecurse; 534 import std.path : buildPath; 535 536 /* A dummy file is used so we don't have to worry about the cases where command 537 * line processing might open a file. Don't want to use standard input for this, 538 * at least in cases where it might try to read to get the header line. 539 */ 540 auto testDir = makeUnittestTempDir("tsv_split_bylinecount"); 541 scope(exit) testDir.rmdirRecurse; 542 543 string somefile_txt = buildPath(testDir, "somefile.txt"); 544 somefile_txt.File("w").writeln("Hello World!"); 545 546 { 547 auto args = ["unittest", "--lines-per-file", "10", somefile_txt]; 548 TsvSplitOptions cmdopt; 549 const r = cmdopt.processArgs(args); 550 551 assert(cmdopt.linesPerFile == 10); 552 assert(cmdopt.keyFields.empty); 553 assert(cmdopt.numFiles == 0); 554 assert(cmdopt.hasHeader == false); 555 } 556 { 557 auto args = ["unittest", "--num-files", "20", somefile_txt]; 558 TsvSplitOptions cmdopt; 559 const r = cmdopt.processArgs(args); 560 561 assert(cmdopt.linesPerFile == 0); 562 assert(cmdopt.keyFields.empty); 563 assert(cmdopt.numFiles == 20); 564 assert(cmdopt.hasHeader == false); 565 } 566 { 567 auto args = ["unittest", "-n", "5", "--key-fields", "1-3", somefile_txt]; 568 TsvSplitOptions cmdopt; 569 const r = cmdopt.processArgs(args); 570 571 assert(cmdopt.linesPerFile == 0); 572 assert(cmdopt.keyFields == [0, 1, 2]); 573 assert(cmdopt.numFiles == 5); 574 
assert(cmdopt.hasHeader == false); 575 assert(cmdopt.keyIsFullLine == false); 576 } 577 { 578 auto args = ["unittest", "-n", "5", "-k", "0", somefile_txt]; 579 TsvSplitOptions cmdopt; 580 const r = cmdopt.processArgs(args); 581 582 assert(cmdopt.linesPerFile == 0); 583 assert(cmdopt.numFiles == 5); 584 assert(cmdopt.hasHeader == false); 585 assert(cmdopt.keyIsFullLine == true); 586 } 587 { 588 auto args = ["unittest", "-n", "2", "--header", somefile_txt]; 589 TsvSplitOptions cmdopt; 590 const r = cmdopt.processArgs(args); 591 592 assert(cmdopt.headerInOut == true); 593 assert(cmdopt.hasHeader == true); 594 assert(cmdopt.headerIn == false); 595 } 596 { 597 auto args = ["unittest", "-n", "2", "--header-in-only", somefile_txt]; 598 TsvSplitOptions cmdopt; 599 const r = cmdopt.processArgs(args); 600 601 assert(cmdopt.headerInOut == false); 602 assert(cmdopt.hasHeader == true); 603 assert(cmdopt.headerIn == true); 604 } 605 606 static void testSuffix(string[] args, string expectedSuffix) 607 { 608 TsvSplitOptions cmdopt; 609 auto savedArgs = args.to!string; 610 const r = cmdopt.processArgs(args); 611 612 assert(r[0], format("[testSuffix] cmdopt.processArgs(%s) returned false.", savedArgs)); 613 assert(cmdopt.suffix == expectedSuffix, 614 format("[testSuffix] Incorrect cmdopt.suffix. Expected: '%s', Actual: '%s'\n cmdopt.processArgs(%s)", 615 expectedSuffix, cmdopt.suffix, savedArgs)); 616 } 617 618 /* In these tests, don't use headers and when files are listed, use 'somefile_txt' first. 619 * This makes sure there is no attempt to read standard input and that there won't be an 620 * open failure trying to find a file. 
621 */ 622 testSuffix(["unittest", "-n", "2"], ""); 623 testSuffix(["unittest", "-n", "2", "--", "-"], ""); 624 testSuffix(["unittest", "-n", "2", "--suffix", "_123"], "_123"); 625 testSuffix(["unittest", "-n", "2", somefile_txt], ".txt"); 626 testSuffix(["unittest", "-n", "2", somefile_txt, "anotherfile.pqr"], ".txt"); 627 testSuffix(["unittest", "-n", "2", "--suffix", ".X", somefile_txt, "anotherfile.pqr"], ".X"); 628 testSuffix(["unittest", "-n", "2", "--suffix", "", somefile_txt], ""); 629 testSuffix(["unittest", "-n", "2", "--", "-", somefile_txt], ""); 630 testSuffix(["unittest", "-n", "2", "--", somefile_txt, "-"], ".txt"); 631 632 static void testDigitWidth(string[] args, uint expected) 633 { 634 TsvSplitOptions cmdopt; 635 auto savedArgs = args.to!string; 636 const r = cmdopt.processArgs(args); 637 638 assert(r[0], format("[testDigitWidth] cmdopt.processArgs(%s) returned false.", savedArgs)); 639 assert(cmdopt.digitWidth == expected, 640 format("[testDigitWidth] Incorrect cmdopt.digitWidth. 
Expected: %d, Actual: %d\n cmdopt.processArgs(%s)", 641 expected, cmdopt.digitWidth, savedArgs)); 642 } 643 644 testDigitWidth(["unittest", "-n", "2", somefile_txt], 1); 645 testDigitWidth(["unittest", "-n", "2", "--digit-width" , "0", somefile_txt], 1); 646 testDigitWidth(["unittest", "-n", "10", somefile_txt], 1); 647 testDigitWidth(["unittest", "-n", "11", somefile_txt], 2); 648 testDigitWidth(["unittest", "-n", "555", somefile_txt], 3); 649 testDigitWidth(["unittest", "-n", "555", "--digit-width" , "2", somefile_txt], 2); 650 testDigitWidth(["unittest", "-n", "555", "--digit-width" , "4", somefile_txt], 4); 651 testDigitWidth(["unittest", "-l", "10", somefile_txt], 3); 652 testDigitWidth(["unittest", "-l", "10000", somefile_txt], 3); 653 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "0", somefile_txt], 3); 654 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "1", somefile_txt], 1); 655 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "5", somefile_txt], 5); 656 } 657 658 /** Get the rlimit current number of open files the process is allowed. 659 * 660 * This routine returns the current soft limit on the number of open files the process 661 * is allowed. This is the number returned by the command: '$ ulimit -n'. 662 * 663 * This routine translates this value to a 'uint', as tsv-split uses 'uint' for 664 * tracking output files. The rlimit 'rlim_t' type is usually 'ulong' or 'long'. 665 * RLIM_INFINITY and any value larger than 'uint.max' is translated to 'uint.max'. 666 * 667 * An exception is thrown if call to 'getrlimit' fails. 
668 */ 669 uint rlimitCurrOpenFilesLimit() 670 { 671 import core.sys.posix.sys.resource : 672 rlim_t, rlimit, getrlimit, RLIMIT_NOFILE, RLIM_INFINITY, RLIM_SAVED_CUR; 673 import std.conv : to; 674 675 uint currOpenFileLimit = uint.max; 676 677 rlimit rlimitMaxOpenFiles; 678 679 enforce(getrlimit(RLIMIT_NOFILE, &rlimitMaxOpenFiles) == 0, 680 "Internal error: getrlimit call failed"); 681 682 if (rlimitMaxOpenFiles.rlim_cur != RLIM_INFINITY && 683 rlimitMaxOpenFiles.rlim_cur != RLIM_SAVED_CUR && 684 rlimitMaxOpenFiles.rlim_cur >= 0 && 685 rlimitMaxOpenFiles.rlim_cur <= uint.max) 686 { 687 currOpenFileLimit = rlimitMaxOpenFiles.rlim_cur.to!uint; 688 } 689 690 return currOpenFileLimit; 691 } 692 693 /** Invokes the proper split routine based on the command line arguments. 694 * 695 * This routine is the top-level control after command line argument processing is 696 * done. It's primary job is to set up data structures and invoke the correct 697 * processing routine based on the command line arguments. 698 */ 699 void tsvSplit(ref TsvSplitOptions cmdopt) 700 { 701 /* Check that the input files were setup as expected. Should at least have one 702 * input, stdin if nothing else. */ 703 assert(!cmdopt.inputSources.empty); 704 705 if (cmdopt.linesPerFile != 0) 706 { 707 splitByLineCount(cmdopt); 708 } 709 else 710 { 711 /* Randomly distribute input lines to a specified number of files. */ 712 713 auto outputFiles = 714 SplitOutputFiles(cmdopt.numFiles, cmdopt.dir, cmdopt.prefix, cmdopt.suffix, 715 cmdopt.digitWidth, cmdopt.headerInOut, cmdopt.maxOpenOutputFiles, 716 cmdopt.inputSources.front.header); 717 718 if (!cmdopt.appendToExistingFiles) 719 { 720 string existingFile = outputFiles.checkIfFilesExist; 721 enforce(existingFile.length == 0, 722 format("One or more output files already exist. Use '--a|append' to append to existing files. 
File: '%s'.", 723 existingFile)); 724 } 725 726 if (cmdopt.keyFields.length == 0) 727 { 728 splitLinesRandomly(cmdopt, outputFiles); 729 } 730 else 731 { 732 splitLinesByKey(cmdopt, outputFiles); 733 } 734 } 735 } 736 737 /** A SplitOutputFiles struct holds a collection of output files. 738 * 739 * This struct manages a collection of output files used when writing to multiple 740 * files at once. This includes constructing filenames, opening and closing files, 741 * and writing data and header lines. 742 * 743 * Both random assignment (splitLinesRandomly) and random assignment by key 744 * (splitLinesByKey) use a SplitOutputFiles struct to manage output files. 745 * 746 * The main properties of the output file set are specified in the constuctor. The 747 * exception is the header line. This is not known until the first input file is 748 * read, so it is specified in a separate 'setHeader' call. 749 * 750 * Individual output files are written to based on their zero-based index in the 751 * output collection. The caller selects the output file number to write to and 752 * calls 'writeDataLine' to write a line. The header is written if needed. 
 */
struct SplitOutputFiles
{
    import std.conv : to;
    import std.file : exists;
    import std.path : buildPath;
    import std.stdio : File;

    /* Per-file bookkeeping: name, handle, and status flags. */
    static struct OutputFile
    {
        string filename;
        File ofile;
        bool hasData;   // True once the file is known to contain data (header or lines).
        bool isOpen;    // Track separately due to https://github.com/dlang/phobos/pull/7397
    }

    private uint _numFiles;
    private bool _writeHeaders;
    private uint _maxOpenFiles;

    private OutputFile[] _outputFiles;
    private uint _numOpenFiles = 0;
    private string _header;

    /* Constructor: assigns a filename to each output file slot. Files are not
     * opened here; they are opened lazily on first write (see writeDataLine).
     */
    this(uint numFiles, string dir, string filePrefix, string fileSuffix,
         uint fileDigitWidth, bool writeHeaders, uint maxOpenFiles, string header)
    {
        assert(numFiles >= 2);
        assert(maxOpenFiles >= 1);

        _numFiles = numFiles;
        _writeHeaders = writeHeaders;
        _maxOpenFiles = maxOpenFiles;
        _header = header;

        _outputFiles.length = numFiles;

        /* Filename assignment: '<dir>/<prefix>NNN<suffix>', NNN zero-padded to fileDigitWidth. */
        foreach (i, ref f; _outputFiles)
        {
            f.filename =
                buildPath(dir, format("%s%.*d%s", filePrefix, fileDigitWidth, i, fileSuffix));
        }
    }

    /* Destructor ensures all files are closed.
     *
     * Note: A dual check on whether the file is open is made. This is to avoid a
     * Phobos bug where std.File doesn't properly maintain the state of open files
     * if the File.open call fails. See: https://github.com/dlang/phobos/pull/7397.
     */
    ~this()
    {
        foreach (ref f; _outputFiles)
        {
            if (f.isOpen && f.ofile.isOpen)
            {
                assert(_numOpenFiles >= 1);

                f.ofile.close;
                f.isOpen = false;
                _numOpenFiles--;
            }
        }
    }

    /* Check if any of the files already exist.
     *
     * Returns the empty string if none of the files exist. Otherwise returns the
     * filename of the first existing file found. This is to facilitate error
     * message generation.
     */
    string checkIfFilesExist()
    {
        foreach (f; _outputFiles) if (f.filename.exists) return f.filename;
        return "";
    }

    /* Picks a random file to close. Used when the open file handle limit has been
     * reached. Scanning starts at a random index and wraps around, so the choice
     * is uniform over the currently open files' positions.
     */
    private void closeSomeFile()
    {
        import std.random : uniform;
        assert(_numOpenFiles > 0);

        immutable uint start = uniform(0, _numFiles);

        foreach (i; cycle(iota(_numFiles), start).take(_numFiles))
        {
            if (_outputFiles[i].isOpen)
            {
                _outputFiles[i].ofile.close;
                _outputFiles[i].isOpen = false;
                _numOpenFiles--;

                return;
            }
        }

        assert(false, "[SplitOutputFiles.closeSomeFile]: Could not find file to close.");
    }

    /* Write a line to the specified file number.
     *
     * A header is written to the file if headers are being written and this is the
     * first data written to the file. The file is opened (in append mode) if not
     * already open, closing some other file first if the open-handle limit has
     * been reached.
     */
    void writeDataLine(uint fileNum, const char[] data)
    {
        assert(fileNum < _numFiles);
        assert(fileNum < _outputFiles.length);
        assert(_numOpenFiles <= _maxOpenFiles);

        OutputFile* outputFile = &_outputFiles[fileNum];

        if (!outputFile.isOpen)
        {
            if (_numOpenFiles == _maxOpenFiles) closeSomeFile();
            assert(_numOpenFiles < _maxOpenFiles);

            outputFile.ofile = outputFile.filename.File("a");
            outputFile.isOpen = true;
            _numOpenFiles++;

            /* An appended-to file may already contain data (e.g. a prior run).
             * File.size of ulong.max indicates the size could not be determined
             * (non-seekable file); treat that as having no data.
             */
            if (!outputFile.hasData)
            {
                ulong filesize = outputFile.ofile.size;
                outputFile.hasData = (filesize > 0 && filesize != ulong.max);
            }
        }

        if (_writeHeaders && !outputFile.hasData) outputFile.ofile.writeln(_header);

        outputFile.ofile.writeln(data);
        outputFile.hasData = true;
    }
}

/** Write input lines to multiple files, randomly selecting an output file for each line.
893 */ 894 void splitLinesRandomly(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) 895 { 896 import std.random : Random = Mt19937, uniform; 897 import tsv_utils.common.utils : bufferedByLine, InputSourceRange; 898 899 /* inputSources must be an InputSourceRange and include at least stdin. */ 900 assert(!cmdopt.inputSources.empty); 901 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 902 903 auto randomGenerator = Random(cmdopt.seed); 904 905 /* Process each line. */ 906 foreach (inputStream; cmdopt.inputSources) 907 { 908 foreach (line; inputStream.file.bufferedByLine) 909 { 910 immutable uint outputFileNum = uniform(0, cmdopt.numFiles, randomGenerator); 911 outputFiles.writeDataLine(outputFileNum, line); 912 } 913 } 914 } 915 916 /** Write input lines to multiple output files using fields as a random selection key. 917 * 918 * Each input line is written to an output file. The output file is chosen using 919 * fields as a key. Each unique key is assigned to a file. All lines having the 920 * same key are written to the same file. 921 */ 922 void splitLinesByKey(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) 923 { 924 import std.algorithm : splitter; 925 import std.conv : to; 926 import std.digest.murmurhash; 927 import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, 928 InputSourceRange, throwIfWindowsNewlineOnUnix; 929 930 assert(cmdopt.keyFields.length > 0); 931 932 /* inputSources must be an InputSourceRange and include at least stdin. */ 933 assert(!cmdopt.inputSources.empty); 934 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 935 936 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 937 938 /* Create a mapping for the key fields. */ 939 auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); 940 941 /* Process each line. */ 942 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 
2 : 1; 943 foreach (inputStream; cmdopt.inputSources) 944 { 945 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 946 947 foreach (fileLineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine)) 948 { 949 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 950 951 /* Murmurhash works by successively adding individual keys, then finalizing. 952 * Adding individual keys is simpler if the full-line-as-key and individual 953 * fields as keys cases are separated. 954 */ 955 auto hasher = MurmurHash3!32(cmdopt.seed); 956 957 if (cmdopt.keyIsFullLine) 958 { 959 hasher.put(cast(ubyte[]) line); 960 } 961 else 962 { 963 assert(keyFieldsReordering !is null); 964 965 /* Gather the key field values and assemble the key. */ 966 keyFieldsReordering.initNewLine; 967 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 968 { 969 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 970 if (keyFieldsReordering.allFieldsFilled) break; 971 } 972 973 enforce(keyFieldsReordering.allFieldsFilled, 974 format("Not enough fields in line. File: %s, Line: %s", 975 inputStream.name, fileLineNum)); 976 977 foreach (count, key; keyFieldsReordering.outputFields.enumerate) 978 { 979 if (count > 0) hasher.put(delimArray); 980 hasher.put(cast(ubyte[]) key); 981 } 982 } 983 984 hasher.finish; 985 immutable uint outputFileNum = hasher.get % cmdopt.numFiles; 986 outputFiles.writeDataLine(outputFileNum, line); 987 } 988 } 989 } 990 991 /** Write input lines to multiple files, splitting based on line count. 992 * 993 * Note: readBufferSize is an argument primarily for unit test purposes. Normal uses 994 * should use the default value. 
 */
void splitByLineCount(ref TsvSplitOptions cmdopt, const size_t readBufferSize = 1024L * 128L)
{
    import std.file : exists;
    import std.path : buildPath;
    import std.stdio : File;
    import tsv_utils.common.utils : InputSourceRange;

    assert (readBufferSize > 0);
    ubyte[] readBuffer = new ubyte[readBufferSize];

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* Header line to replicate at the top of each output file (terminator included),
     * or the empty string when headers are not written to the output files. */
    string header = !cmdopt.headerInOut ? "" :
        cmdopt.inputSources.front.header(Yes.keepTerminator);
    size_t nextOutputFileNum = 0;
    File outputFile;
    string outputFileName;
    bool isOutputFileOpen = false;           // Open file status tracked separately due to phobos bugs
    size_t outputFileRemainingLines;

    /* nextNewlineIndex finds the index of the next newline character. It is an
     * alternative to std.algorithm.countUntil. Invoking 'find' directly results in
     * 'memchr' being used (faster). The current 'countUntil' implementation does
     * forward to find, but the way it is done avoids the memchr call optimization.
     */
    static long nextNewlineIndex(const ubyte[] buffer)
    {
        import std.algorithm : find;
        immutable ubyte newlineChar = '\n';
        immutable size_t buflen = buffer.length;
        immutable size_t findlen = buffer.find(newlineChar).length;

        return findlen > 0 ? buflen - findlen : -1;
    }

    foreach (inputStream; cmdopt.inputSources)
    {
        foreach (ref ubyte[] inputChunk; inputStream.file.byChunk(readBuffer))
        {
            /* Each read chunk may span several output files. Walk the chunk,
             * carving off runs of whole lines for the current output file. */
            size_t nextOutputChunkStart = 0;
            auto remainingInputChunk = inputChunk[nextOutputChunkStart .. $];

            while (!remainingInputChunk.empty)
            {
                /* See if the next output file needs to be opened. */
                if (!isOutputFileOpen)
                {
                    outputFileName =
                        buildPath(cmdopt.dir,
                                  format("%s%.*d%s", cmdopt.prefix,
                                         cmdopt.digitWidth, nextOutputFileNum, cmdopt.suffix));

                    enforce(cmdopt.appendToExistingFiles || !outputFileName.exists,
                            format("Output file already exists. Use '--a|append' to append to existing files. File: '%s'.",
                                   outputFileName));

                    outputFile = outputFileName.File("ab");
                    outputFile.setvbuf(1024L * 64L, _IOFBF);
                    isOutputFileOpen = true;
                    ++nextOutputFileNum;
                    outputFileRemainingLines = cmdopt.linesPerFile;

                    /* Write the header only if the file is empty. A size of
                     * ulong.max indicates the size could not be determined. */
                    if (cmdopt.headerInOut)
                    {
                        ulong filesize = outputFile.size;
                        if (filesize == 0 || filesize == ulong.max) outputFile.rawWrite(header);
                    }
                }

                /* Find more newlines for the current output file. */

                assert(outputFileRemainingLines > 0);

                size_t nextOutputChunkEnd = nextOutputChunkStart;

                while (outputFileRemainingLines != 0 && !remainingInputChunk.empty)
                {
                    /* Note: newlineIndex is relative to 'remainingInputChunk', not
                     * 'inputChunk'. Updates to variables referring to 'inputChunk'
                     * need to reflect this. In particular, 'nextOutputChunkEnd'.
                     */
                    immutable newlineIndex = nextNewlineIndex(remainingInputChunk);

                    if (newlineIndex == -1)
                    {
                        /* No newline in the rest of the chunk: take it all; the
                         * line continues in the next chunk. */
                        nextOutputChunkEnd = inputChunk.length;
                    }
                    else
                    {
                        --outputFileRemainingLines;
                        nextOutputChunkEnd += (newlineIndex + 1);
                    }

                    remainingInputChunk = inputChunk[nextOutputChunkEnd .. $];
                }

                assert(nextOutputChunkStart < nextOutputChunkEnd);
                assert(nextOutputChunkEnd <= inputChunk.length);

                outputFile.rawWrite(inputChunk[nextOutputChunkStart .. nextOutputChunkEnd]);

                if (outputFileRemainingLines == 0)
                {
                    outputFile.close;
                    isOutputFileOpen = false;
                }

                nextOutputChunkStart = nextOutputChunkEnd;

                assert(remainingInputChunk.length == inputChunk.length - nextOutputChunkStart);
            }
        }
    }
}

/* splitByLineCount unit tests.
 *
 * These tests are primarily for buffer management. There are edge cases involving the
 * interaction of buffer size, input file size, lines-per-file, and newline placement
 * that are difficult to test against the executable.
 */
unittest
{
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.algorithm : min;
    import std.array : appender;
    import std.conv : to;
    import std.file : exists, mkdir, rmdirRecurse;
    import std.path : buildPath;
    import std.process : escapeShellCommand, executeShell;

    /* Test setup
     *
     * A set of twenty input files is created, with names: input_NxM.txt, where
     * N is the number of characters in each row and M is the number of rows (lines).
     * The resulting files are put in the "lc_input" directory ('inputDir' variable)
     * and have names:
     *     input_0x2.txt, input_0x3.txt, ... input_5x5.txt.
     *
     * A standalone block of code produces the expected result files for splitting an
     * input file into a set of output files. This duplicates the splitByLineCount
     * output. This is done for lines-per-file counts 1 to 5. Each result set is placed
     * in a subdirectory under "lc_expected" ('expectedDir' variable). Subdirectories
     * have names like: "0x2_by_1", "0x3_by_1", ..., "5x5_by_4".
     *
     * splitByLine is called for all the same input files and lines-per-file settings used
     * to produce the expected output. This is done via testSplitByLineCount, which calls
     * command line argument processing and splitByLine, similar to how the main program
     * works. The results are written to a subdirectory. The subdirectory is compared to
     * the expected output directory using the system 'diff' command.
     *
     * splitByLine is called multiple times for each expected output case. The different
     * calls iterate over a series of small ReadBufferSizes. This is how tests for edge
     * cases in the readBufferSize vs line lengths, newline placement, etc., are
     * accomplished.
     *
     * Note: One way to understand what is going on is to comment out the line:
     *
     *     scope(exit) testDir.rmdirRecurse;
     *
     * Then run the test (e.g. 'make test') and look at the directory structure left
     * behind. Print out the 'testDir' directory to see where it is located.
     */

    /* testSplitByLineCount acts as a surrogate for main() and tsvSplit(). It makes the
     * call to splitByLineCount and calls 'diff' to compare the output directory to the
     * expected directory. An assert is thrown if the directories do not match.
     */
    static void testSplitByLineCount(string[] cmdArgs, string expectedDir,
                                     size_t readBufferSize = 1024L * 512L)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSplitByLineCount] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSplitByLineCount] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSplitOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
        assert(cmdopt.linesPerFile != 0, "[testSplitByLineCount] --lines-per-file is required.");
        assert(!cmdopt.dir.empty, "[testSplitByLineCount] --dir is required.");

        splitByLineCount(cmdopt, readBufferSize);

        /* Diff command setup. */
        auto diffCmdArgs = ["diff", expectedDir, cmdopt.dir];
        auto diffResult = executeShell(escapeShellCommand(diffCmdArgs));
        assert(diffResult.status == 0,
               format("[testSplitByLineCount]\n cmd: %s\n readBufferSize: %d\n expectedDir: %s\n------ Diff ------%s\n-------",
                      savedCmdArgs, readBufferSize, expectedDir, diffResult.output));
    }

    auto testDir = makeUnittestTempDir("tsv_split_bylinecount");
    scope(exit) testDir.rmdirRecurse;

    auto inputDir = buildPath(testDir, "lc_input");
    auto outputDir = buildPath(testDir, "lc_output");
    auto expectedDir = buildPath(testDir, "lc_expected");

    mkdir(inputDir);
    mkdir(outputDir);
    mkdir(expectedDir);

    static string buildInputFilePath(string dir, long inputLineLength, long inputFileNumLines)
    {
        return buildPath(dir, format("input_%dx%d.txt", inputLineLength, inputFileNumLines));
    }

    string[5] outputRowData =
        [
            "abcde",
            "fghij",
            "klmno",
            "pqrst",
            "uvwxy"
        ];

    /* The main test loop. Iterates over input line lengths, numbers of rows,
     * lines-per-file, and finally readBufferSize lengths. All combos are tested.
     */
    foreach (inputLineLength; 0 .. 6)
    {
        foreach (inputFileNumLines; 2 .. 6)
        {
            auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);

            /* Generate the input file for this line-length/row-count combo. */
            {
                auto ofile = inputFile.File("w");
                auto output = appender!(char[])();
                foreach (m; 0 .. inputFileNumLines)
                {
                    put(output, outputRowData[m][0 .. inputLineLength]);
                    put(output, '\n');
                }
                ofile.write(output.data);
                ofile.close;
            }

            /* Iterate over the different lines-per-file lengths.
             * - Create an expected output directory and files for each.
             * - Test with different readBufferSize values.
             */
            foreach (outputFileNumLines; 1 .. min(5, inputFileNumLines))
            {
                auto expectedSubDir =
                    buildPath(expectedDir, format("%dx%d_by_%d", inputLineLength,
                                                  inputFileNumLines, outputFileNumLines));
                mkdir(expectedSubDir);

                size_t filenum = 0;
                size_t linesWritten = 0;
                while (linesWritten < inputFileNumLines)
                {
                    auto expectedFile = buildPath(expectedSubDir, format("part_%d.txt", filenum));
                    auto f = expectedFile.File("w");
                    auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);
                    foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
                    {
                        f.writeln(line[0 .. inputLineLength]);
                    }
                    linesWritten += linesToWrite;
                    ++filenum;
                    f.close;
                }

                /* Test the different readBufferSizes.
                 * - An output directory is created for the run and deleted afterward.
                 * - First test the default size.
                 * - Then iterate over small readBufferSize values.
                 */
                auto outputSubDir =
                    buildPath(outputDir, format("%dx%d_by_%d", inputLineLength,
                                                inputFileNumLines, outputFileNumLines));
                mkdir(outputSubDir);

                testSplitByLineCount(
                    ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                     "--digit-width", "1", inputFile],
                    expectedSubDir);

                outputSubDir.rmdirRecurse;

                foreach (readBufSize; 1 .. 8)
                {
                    mkdir(outputSubDir);

                    testSplitByLineCount(
                        ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                         "--digit-width", "1", inputFile],
                        expectedSubDir, readBufSize);

                    outputSubDir.rmdirRecurse;
                }
            }
        }
    }

    {
        /* Tests for the special case where readBufferSize is smaller than the header
         * line. We'll reuse the input_5x4.txt input file and write 1 line-per-file.
         */
        immutable inputLineLength = 5;
        immutable inputFileNumLines = 4;
        immutable outputFileNumLines = 1;

        auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);
        assert(inputFile.exists);

        auto expectedSubDirHeader =
            buildPath(expectedDir, format("%dx%d_by_%d_header", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        auto expectedSubDirHeaderInOnly =
            buildPath(expectedDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        mkdir(expectedSubDirHeader);
        mkdir(expectedSubDirHeaderInOnly);

        /* Generate the expected results. Cheat by starting with linesWritten = 1. This
         * automatically excludes the header line, but keeps the loop code consistent
         * with the main test loop.
         */
        size_t filenum = 0;
        size_t linesWritten = 1;
        while (linesWritten < inputFileNumLines)
        {
            auto expectedFileHeader = buildPath(expectedSubDirHeader, format("part_%d.txt", filenum));
            auto expectedFileHeaderInOnly = buildPath(expectedSubDirHeaderInOnly,
                                                      format("part_%d.txt", filenum));
            auto fHeader = expectedFileHeader.File("w");
            auto fHeaderInOnly = expectedFileHeaderInOnly.File("w");
            auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);

            fHeader.writeln(outputRowData[0][0 .. inputLineLength]);
            foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
            {
                fHeader.writeln(line[0 .. inputLineLength]);
                fHeaderInOnly.writeln(line[0 .. inputLineLength]);
            }
            linesWritten += linesToWrite;
            ++filenum;
            fHeader.close;
            fHeaderInOnly.close;
        }

        /* Now run the tests. */
        auto outputSubDirHeader =
            buildPath(outputDir, format("%dx%d_by_%d_header", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));
        auto outputSubDirHeaderInOnly =
            buildPath(outputDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));

        foreach (readBufSize; 1 .. 6)
        {
            mkdir(outputSubDirHeader);
            mkdir(outputSubDirHeaderInOnly);

            testSplitByLineCount(
                ["test", "--header", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeader, "--digit-width", "1", inputFile],
                expectedSubDirHeader, readBufSize);

            testSplitByLineCount(
                ["test", "--header-in-only", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeaderInOnly, "--digit-width", "1", inputFile],
                expectedSubDirHeaderInOnly, readBufSize);

            outputSubDirHeader.rmdirRecurse;
            outputSubDirHeaderInOnly.rmdirRecurse;
        }
    }
}