/**
Command line tool for splitting a file (or files) into multiple output files.
Several methods for splitting are available, including splitting by line count,
splitting by random assignment, and splitting by random assignment based on
key fields.

Copyright (c) 2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_split;

import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

/* Disable GC cleanup at program exit; the OS reclaims memory faster. Only
 * available on compiler front-ends 2.085 and later.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program.
     *
     * Invokes command line argument processing and calls tsvSplit to do the real
     * work. Errors occurring during processing are caught and reported to the user.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try
        {
            tsvSplit(cmdopt);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

immutable helpText = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

By default, files are written to the current directory and have names
of the form 'part_NNN<suffix>', with 'NNN' being a number and <suffix>
being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The output
directory and file names are customizable.

Use '--help-verbose' for more detailed information.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

Output files: By default, files are written to the current directory and
have names of the form 'part_NNN<suffix>', with 'NNN' being a number and
<suffix> being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The suffix is
empty when reading from standard input. The numeric part defaults to 3
digits for '--l|lines-per-files'. For '--n|num-files' enough digits are
used so all filenames are the same length. The output directory and file
names are customizable.

Header lines: There are two ways to handle input with headers: write a
header to all output files (--H|header), or exclude headers from all
output files ('--I|header-in-only'). The best choice depends on the
follow-up processing. All tsv-utils tools support header lines in multiple
input files, but many other tools do not. For example, GNU parallel works
best on files without header lines.

Random assignment (--n|num-files): Random distribution of records to a set
of files is a common task. When data fits in memory the preferred approach
is usually to shuffle the data and split it into fixed sized blocks. E.g.
'tsv-sample data.tsv | tsv-split -l NUM'. However, alternate approaches
are needed when data is too large for convenient shuffling. tsv-split's
random assignment feature is useful in this case. Each input line is
written to a randomly selected output file. Note that output files will
have similar but not identical numbers of records.

Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): This
splits a data set into multiple files sharded by key. All lines with the
same key are written to the same file. This partitioning enables parallel
computation based on the key. For example, statistical calculation
('tsv-summarize --group-by') or duplicate removal ('tsv-uniq --fields').
These operations can be parallelized using tools like GNU parallel, which
simplifies concurrent operations on multiple files.

Random seed: By default, each tsv-split invocation using random assignment
or random assignment by key produces different assignments to the output
files. Using '--s|static-seed' changes this so multiple runs produce the
same assignments. This works by using the same random seed each run. The
seed can be specified using '--v|seed-value'.

Appending to existing files: By default, an error is triggered if an
output file already exists. '--a|append' changes this so that lines are
appended to existing files. (Header lines are not appended to files with
data.) This is useful when adding new data to files created by a previous
tsv-split run. Random assignment should use the same '--n|num-files' value
each run, but different random seeds (avoid '--s|static-seed'). Random
assignment by key should use the same '--n|num-files', '--k|key-fields',
and seed ('--s|static-seed' or '--v|seed-value') each run.

Max number of open files: Random assignment and random assignment by key
are dramatically faster when all output files are kept open. However,
keeping a large numbers of open files can bump into system limits or limit
resources available to other processes. By default, tsv-split uses up to
4096 open files or the system per-process limit, whichever is smaller.
This can be changed using '--max-open-files', though it cannot be set
larger than the system limit. The system limit varies considerably between
systems. On many systems it is unlimited. On MacOS it is often set to 256.
Use Unix 'ulimit' to display and modify the limits:
* 'ulimit -n' - Show the "soft limit". The per-process maximum.
* 'ulimit -Hn' - Show the "hard limit". The max allowed soft limit.
* 'ulimit -Sn NUM' - Change the "soft limit" to NUM.

Examples:

# Split a 10 million line file into 1000 files, 10,000 lines each.
# Output files are part_000.txt, part_001.txt, ... part_999.txt.
tsv-split data.txt --lines-per-file 10000

# Same as the previous example, but write files to a subdirectory.
tsv-split data.txt --dir split_files --lines-per-file 10000

# Split a file into 10,000 line files, writing a header line to each
tsv-split data.txt -H --lines-per-file 10000

# Same as the previous example, but dropping the header line.
tsv-split data.txt -I --lines-per-file 10000

# Randomly assign lines to 1000 files
tsv-split data.txt --num-files 1000

# Randomly assign lines to 1000 files while keeping unique keys from
# field 3 together.
tsv-split data.tsv --num-files 1000 -k 3

# Randomly assign lines to 1000 files. Later, randomly assign lines
# from a second data file to the same output files.
tsv-split data1.tsv -n 1000
tsv-split data2.tsv -n 1000 --append

# Randomly assign lines to 1000 files using field 3 as a key.
# Later, add a second file to the same output files.
tsv-split data1.tsv -n 1000 -k 3 --static-seed
tsv-split data2.tsv -n 1000 -k 3 --static-seed --append

# Change the system per-process open file limit for one command.
# The parens create a sub-shell. The current shell is not changed.
( ulimit -Sn 1000 && tsv-split --num-files 1000 data.txt )

Options:
EOS";

/** Container for command line options and derived data.
 *
 * TsvSplitOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, TsvSplitOptions is used as a container
 * holding the specific processing options used by the splitting algorithms.
215 */ 216 struct TsvSplitOptions 217 { 218 enum invalidFileSuffix = "///////"; 219 220 string programName; /// Program name 221 string[] files; /// Input files 222 bool helpVerbose = false; /// --help-verbose 223 bool headerInOut = false; /// --H|header 224 bool headerIn = false; /// --I|header-in-only 225 size_t linesPerFile = 0; /// --l|lines-per-file 226 uint numFiles = 0; /// --n|num-files 227 size_t[] keyFields; /// --k|key-fields 228 string dir; /// --dir 229 string prefix = "part_"; /// --prefix 230 string suffix = invalidFileSuffix; /// --suffix 231 uint digitWidth = 0; /// --w|digit-width 232 bool appendToExistingFiles = false; /// --a|append 233 bool staticSeed = false; /// --s|static-seed 234 uint seedValueOptionArg = 0; /// --v|seed-value 235 char delim = '\t'; /// --d|delimiter 236 uint maxOpenFilesArg = 0; /// --max-open-files 237 bool versionWanted = false; /// --V|version 238 bool hasHeader = false; /// Derived. True if either '--H|header' or '--I|header-in-only' is set. 239 bool keyIsFullLine = false; /// Derived. True if '--f|fields 0' is specfied. 240 bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value 241 uint seed = 0; /// Derived from --static-seed, --seed-value 242 uint maxOpenOutputFiles; /// Derived. 243 244 /** Process tsv-split command line arguments. 245 * 246 * Defines the command line options, performs validation, and derives additional 247 * state. std.getopt.getopt is called to do the main option processing followed 248 * additional validation and derivation. 249 * 250 * Help text is printed to standard output if help was requested. Error text is 251 * written to stderr if invalid input is encountered. 252 * 253 * A tuple is returned. First value is true if command line arguments were 254 * successfully processed and execution should continue, or false if an error 255 * occurred or the user asked for help. If false, the second value is the 256 * appropriate exit code (0 or 1). 
257 * 258 * Returning true (execution continues) means args have been validated and derived 259 * values calculated. Field indices will have been converted to zero-based. 260 */ 261 auto processArgs(ref string[] cmdArgs) 262 { 263 import std.algorithm : any, canFind, each, min; 264 import std.file : exists, isDir; 265 import std.format : format; 266 import std.getopt; 267 import std.math : isNaN; 268 import std.path : baseName, expandTilde, extension, stripExtension; 269 import std.typecons : Yes, No; 270 import tsv_utils.common.utils : makeFieldListOptionHandler; 271 272 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 273 274 try 275 { 276 arraySep = ","; // Use comma to separate values in command line options 277 auto r = getopt( 278 cmdArgs, 279 "help-verbose", " Print more detailed help.", &helpVerbose, 280 281 std.getopt.config.caseSensitive, 282 "H|header", " Input files have a header line. Write the header to each output file.", &headerInOut, 283 "I|header-in-only", " Input files have a header line. Do not write the header to output files.", &headerIn, 284 std.getopt.config.caseInsensitive, 285 286 "l|lines-per-file", "NUM Number of lines to write to each output file (excluding the header line).", &linesPerFile, 287 "n|num-files", "NUM Number of output files to generate.", &numFiles, 288 "k|key-fields", "<field-list> Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.", 289 keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero), 290 291 "dir", "STR Directory to write to. Default: Current working directory.", &dir, 292 "prefix", "STR Filename prefix. Default: 'part_'", &prefix, 293 "suffix", "STR Filename suffix. Default: First input file extension. None for standard input.", &suffix, 294 "w|digit-width", "NUM Number of digits in filename numeric portion. 
Default: '--l|lines-per-file': 3. '--n|num-files': Chosen so filenames have the same length. '--w|digit-width 0' uses the default.", &digitWidth, 295 "a|append", " Append to existing files.", &appendToExistingFiles, 296 297 "s|static-seed", " Use the same random seed every run.", &staticSeed, 298 299 std.getopt.config.caseSensitive, 300 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, 301 std.getopt.config.caseInsensitive, 302 303 "d|delimiter", "CHR Field delimiter.", &delim, 304 "max-open-files", "NUM Maximum open file handles to use. Min of 5 required.", &maxOpenFilesArg, 305 306 std.getopt.config.caseSensitive, 307 "V|version", " Print version information and exit.", &versionWanted, 308 std.getopt.config.caseInsensitive, 309 ); 310 311 if (r.helpWanted) 312 { 313 defaultGetoptPrinter(helpText, r.options); 314 return tuple(false, 0); 315 } 316 else if (helpVerbose) 317 { 318 defaultGetoptPrinter(helpTextVerbose, r.options); 319 return tuple(false, 0); 320 } 321 else if (versionWanted) 322 { 323 import tsv_utils.common.tsvutils_version; 324 writeln(tsvutilsVersionNotice("tsv-split")); 325 return tuple(false, 0); 326 } 327 328 /* 329 * Validation and derivations. 
330 */ 331 332 if (linesPerFile == 0 && numFiles == 0) 333 { 334 throw new Exception ("Either '--l|lines-per-file' or '--n|num-files' is required."); 335 } 336 337 if (linesPerFile != 0 && numFiles != 0) 338 { 339 throw new Exception ("'--l|lines-per-file' and '--n|num-files' cannot be used together."); 340 } 341 342 if (linesPerFile != 0 && keyFields.length != 0) 343 { 344 throw new Exception ("'--l|lines-per-file' and '--k|key-fields' cannot be used together."); 345 } 346 347 if (numFiles == 1) 348 { 349 throw new Exception("'--n|num-files must be two or more."); 350 } 351 352 if (keyFields.length > 0) 353 { 354 if (keyFields.length == 1 && keyFields[0] == 0) 355 { 356 keyIsFullLine = true; 357 } 358 else 359 { 360 if (keyFields.length > 1 && keyFields.any!(x => x == 0)) 361 { 362 throw new Exception( 363 "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); 364 } 365 366 keyFields.each!((ref x) => --x); // Convert to zero-based indexing. 367 } 368 } 369 370 if (headerInOut && headerIn) 371 { 372 throw new Exception("Use only one of '--H|header' and '--I|header-in-only'."); 373 } 374 375 hasHeader = headerInOut || headerIn; 376 377 if (!dir.empty) 378 { 379 dir = dir.expandTilde; 380 if (!dir.exists) throw new Exception(format("Directory does not exist: --dir '%s'", dir)); 381 else if (!dir.isDir) throw new Exception(format("Path is not a directory: --dir '%s'", dir)); 382 } 383 384 /* Seed. */ 385 import std.random : unpredictableSeed; 386 387 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 388 389 if (usingUnpredictableSeed) seed = unpredictableSeed; 390 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 391 else if (staticSeed) seed = 2438424139; 392 else assert(0, "Internal error, invalid seed option states."); 393 394 /* Maximum number of open files. Mainly applies when --num-files is used. 395 * 396 * Derive maxOpenOutputFiles. Inputs: 397 * - Internal default limit: 4096. 
This is a somewhat conservative setting. 398 * - rlimit open files limit. Defined by '$ ulimit -n'. 399 * - '--max-open-files' (maxOpenFilesArg). This adjusts the internal limit, 400 * but only up to the rlimit value. 401 * - Four open files are reserved for stdin, stdout, stderr, and one input 402 * file. 403 */ 404 405 immutable uint internalDefaultMaxOpenFiles = 4096; 406 immutable uint numReservedOpenFiles = 4; 407 immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit(); 408 409 if (maxOpenFilesArg != 0 && maxOpenFilesArg <= numReservedOpenFiles) 410 { 411 throw new Exception( 412 format("'--max-open-files' must be at least %d.", 413 numReservedOpenFiles + 1)); 414 } 415 416 if (maxOpenFilesArg > rlimitOpenFilesLimit) 417 { 418 throw new Exception( 419 format("'--max-open-files' value (%d) greater current system limit (%d)." ~ 420 "\nRun 'ulimit -n' to see the soft limit." ~ 421 "\nRun 'ulimit -Hn' to see the hard limit." ~ 422 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 423 maxOpenFilesArg, rlimitOpenFilesLimit)); 424 } 425 426 if (rlimitOpenFilesLimit <= numReservedOpenFiles) 427 { 428 throw new Exception( 429 format("System open file limit too small. Current value: %d. Must be %d or more." ~ 430 "\nRun 'ulimit -n' to see the soft limit." ~ 431 "\nRun 'ulimit -Hn' to see the hard limit." ~ 432 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 433 rlimitOpenFilesLimit, numReservedOpenFiles + 1)); 434 } 435 436 immutable uint openFilesLimit = 437 (maxOpenFilesArg != 0) 438 ? maxOpenFilesArg 439 : min(internalDefaultMaxOpenFiles, rlimitOpenFilesLimit); 440 441 assert(openFilesLimit > numReservedOpenFiles); 442 443 maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles; 444 445 /* Remaining command line args. 446 * 447 * Assume remaining args are files. Use standard input if files were not 448 * provided. 449 */ 450 451 files ~= (cmdArgs.length > 1) ? cmdArgs[1 .. 
$] : ["-"]; 452 cmdArgs.length = 1; 453 454 /* Suffix - If not provided, use the extension of the first input file. 455 * No suffix if reading from standard input. 456 */ 457 if (suffix == invalidFileSuffix) suffix = files[0].extension; 458 459 /* Ensure forward slash is not included in the filename prefix and suffix. 460 * Forward slash is an invalid Unix filename character. However, open file 461 * calls could match a directory path, resulting in unintended file 462 * creation. 463 * 464 * The other invalid filename character on Unix is the NULL character. 465 * However, the NULL character cannot be entered via Unix command lines, 466 * so there is no need to test for it explicitly. 467 */ 468 if (prefix.canFind('/')) 469 { 470 throw new Exception("'--prefix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 471 } 472 if (suffix.canFind('/')) 473 { 474 throw new Exception("'--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 475 } 476 477 /* Digit width - If not specified, or specified as zero, the width is 478 * determined by the number of files for --num-files, or defaulted to 3 479 * for --lines-per-file. 480 */ 481 if (digitWidth == 0) 482 { 483 if (numFiles > 0) 484 { 485 digitWidth = 1; 486 uint n = numFiles - 1; 487 while (n >= 10) 488 { 489 n /= 10; 490 ++digitWidth; 491 } 492 } 493 else 494 { 495 digitWidth = 3; 496 } 497 } 498 assert(digitWidth != 0); 499 } 500 catch (Exception exc) 501 { 502 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 503 return tuple(false, 1); 504 } 505 return tuple(true, 0); 506 } 507 } 508 509 /* TsvSplitOptions unit tests (command-line argument processing). 510 * 511 * Basic tests. Many cases are covered in executable tests, including all error cases, 512 * as errors write to stderr. 
 */
unittest
{
    import std.conv : to;
    import std.format : format;

    /* --lines-per-file mode; no files given, so standard input is assumed. */
    {
        auto args = ["unittest", "--lines-per-file", "10"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.files == ["-"]);
        assert(cmdopt.linesPerFile == 10);
        assert(cmdopt.keyFields.empty);
        assert(cmdopt.numFiles == 0);
        assert(cmdopt.hasHeader == false);
    }
    /* --num-files mode without a key. */
    {
        auto args = ["unittest", "--num-files", "20"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.files == ["-"]);
        assert(cmdopt.linesPerFile == 0);
        assert(cmdopt.keyFields.empty);
        assert(cmdopt.numFiles == 20);
        assert(cmdopt.hasHeader == false);
    }
    /* Key fields are converted to zero-based indexing ("1-3" -> [0, 1, 2]). */
    {
        auto args = ["unittest", "-n", "5", "--key-fields", "1-3"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.linesPerFile == 0);
        assert(cmdopt.keyFields == [0, 1, 2]);
        assert(cmdopt.numFiles == 5);
        assert(cmdopt.hasHeader == false);
        assert(cmdopt.keyIsFullLine == false);
    }
    /* Key field zero means the entire line is the key. */
    {
        auto args = ["unittest", "-n", "5", "-k", "0"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.linesPerFile == 0);
        assert(cmdopt.numFiles == 5);
        assert(cmdopt.hasHeader == false);
        assert(cmdopt.keyIsFullLine == true);
    }
    /* --header: headers are read and written to all output files. */
    {
        auto args = ["unittest", "-n", "2", "--header"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.headerInOut == true);
        assert(cmdopt.hasHeader == true);
        assert(cmdopt.headerIn == false);
    }
    /* --header-in-only: headers are read but excluded from output files. */
    {
        auto args = ["unittest", "-n", "2", "--header-in-only"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.headerInOut == false);
        assert(cmdopt.hasHeader == true);
        assert(cmdopt.headerIn == true);
    }

    /* Helper verifying both the derived suffix and the derived file list. */
    static void testSuffix(string[] args, string expectedSuffix, string[] expectedFiles)
    {
        TsvSplitOptions cmdopt;
        auto savedArgs = args.to!string;
        const r = cmdopt.processArgs(args);

        assert(r[0], format("[testSuffix] cmdopt.processArgs(%s) returned false.", savedArgs));
        assert(cmdopt.suffix == expectedSuffix,
               format("[testSuffix] Incorrect cmdopt.suffix. Expected: '%s', Actual: '%s'\n   cmdopt.processArgs(%s)",
                      expectedSuffix, cmdopt.suffix, savedArgs));
        assert(cmdopt.files == expectedFiles,
               format("[testSuffix] Incorrect cmdopt.files. Expected: %s, Actual: %s\n   cmdopt.processArgs(%s)",
                      expectedFiles, cmdopt.files, savedArgs));
    }

    /* The suffix is taken from the first input file; empty for standard input. */
    testSuffix(["unittest", "-n", "2"], "", ["-"]);
    testSuffix(["unittest", "-n", "2", "--", "-"], "", ["-"]);
    testSuffix(["unittest", "-n", "2", "--suffix", "_123"], "_123", ["-"]);
    testSuffix(["unittest", "-n", "2", "somefile.txt"], ".txt", ["somefile.txt"]);
    testSuffix(["unittest", "-n", "2", "somefile.txt", "anotherfile.pqr"],
               ".txt", ["somefile.txt", "anotherfile.pqr"]);
    testSuffix(["unittest", "-n", "2", "--suffix", ".X", "somefile.txt", "anotherfile.pqr"],
               ".X", ["somefile.txt", "anotherfile.pqr"]);
    testSuffix(["unittest", "-n", "2", "--suffix", "", "somefile.txt"],
               "", ["somefile.txt"]);
    testSuffix(["unittest", "-n", "2", "--", "-", "somefile.txt"],
               "", ["-", "somefile.txt"]);
    testSuffix(["unittest", "-n", "2", "--", "somefile.txt", "-"],
               ".txt", ["somefile.txt", "-"]);

    /* Helper verifying the derived digit width. */
    static void testDigitWidth(string[] args, uint expected)
    {
        TsvSplitOptions cmdopt;
        auto savedArgs = args.to!string;
        const r = cmdopt.processArgs(args);

        assert(r[0], format("[testDigitWidth] cmdopt.processArgs(%s) returned false.", savedArgs));
        assert(cmdopt.digitWidth == expected,
               format("[testDigitWidth] Incorrect cmdopt.digitWidth. Expected: %d, Actual: %d\n   cmdopt.processArgs(%s)",
                      expected, cmdopt.digitWidth, savedArgs));
    }

    /* --num-files derives just enough digits; --lines-per-file defaults to 3. */
    testDigitWidth(["unittest", "-n", "2"], 1);
    testDigitWidth(["unittest", "-n", "2", "--digit-width" , "0"], 1);
    testDigitWidth(["unittest", "-n", "10"], 1);
    testDigitWidth(["unittest", "-n", "11"], 2);
    testDigitWidth(["unittest", "-n", "555"], 3);
    testDigitWidth(["unittest", "-n", "555", "--digit-width" , "2"], 2);
    testDigitWidth(["unittest", "-n", "555", "--digit-width" , "4"], 4);
    testDigitWidth(["unittest", "-l", "10"], 3);
    testDigitWidth(["unittest", "-l", "10000"], 3);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "0"], 3);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "1"], 1);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "5"], 5);
}

/** Get the rlimit current number of open files the process is allowed.
 *
 * This routine returns the current soft limit on the number of open files the process
 * is allowed. This is the number returned by the command: '$ ulimit -n'.
 *
 * This routine translates this value to a 'uint', as tsv-split uses 'uint' for
 * tracking output files. The rlimit 'rlim_t' type is usually 'ulong' or 'long'.
 * RLIM_INFINITY and any value larger than 'uint.max' is translated to 'uint.max'.
 *
 * An exception is thrown if the call to 'getrlimit' fails.
647 */ 648 uint rlimitCurrOpenFilesLimit() 649 { 650 import core.sys.posix.sys.resource : 651 rlim_t, rlimit, getrlimit, RLIMIT_NOFILE, RLIM_INFINITY, RLIM_SAVED_CUR; 652 import std.conv : to; 653 654 uint currOpenFileLimit = uint.max; 655 656 rlimit rlimitMaxOpenFiles; 657 658 if (getrlimit(RLIMIT_NOFILE, &rlimitMaxOpenFiles) != 0) 659 { 660 throw new Exception("Internal error: getrlimit call failed"); 661 } 662 663 if (rlimitMaxOpenFiles.rlim_cur != RLIM_INFINITY && 664 rlimitMaxOpenFiles.rlim_cur != RLIM_SAVED_CUR && 665 rlimitMaxOpenFiles.rlim_cur >= 0 && 666 rlimitMaxOpenFiles.rlim_cur <= uint.max) 667 { 668 currOpenFileLimit = rlimitMaxOpenFiles.rlim_cur.to!uint; 669 } 670 671 return currOpenFileLimit; 672 } 673 674 /** Invokes the proper split routine based on the command line arguments. 675 * 676 * This routine is the top-level control after command line argument processing is 677 * done. It's primary job is to set up data structures and invoke the correct 678 * processing routine based on the command line arguments. 679 */ 680 void tsvSplit(TsvSplitOptions cmdopt) 681 { 682 import std.format : format; 683 684 if (cmdopt.linesPerFile != 0) 685 { 686 splitByLineCount(cmdopt); 687 } 688 else 689 { 690 /* Randomly distribute input lines to a specified number of files. */ 691 692 auto outputFiles = 693 SplitOutputFiles(cmdopt.numFiles, cmdopt.dir, cmdopt.prefix, cmdopt.suffix, 694 cmdopt.digitWidth, cmdopt.headerInOut, cmdopt.maxOpenOutputFiles); 695 696 if (!cmdopt.appendToExistingFiles) 697 { 698 string existingFile = outputFiles.checkIfFilesExist; 699 700 if (existingFile.length != 0) 701 { 702 throw new Exception( 703 format("One or more output files already exist. Use '--a|append' to append to existing files. 
File: '%s'.", 704 existingFile)); 705 } 706 } 707 708 if (cmdopt.keyFields.length == 0) 709 { 710 splitLinesRandomly(cmdopt, outputFiles); 711 } 712 else 713 { 714 splitLinesByKey(cmdopt, outputFiles); 715 } 716 } 717 } 718 719 /** A SplitOutputFiles struct holds a collection of output files. 720 * 721 * This struct manages a collection of output files used when writing to multiple 722 * files at once. This includes constructing filenames, opening and closing files, 723 * and writing data and header lines. 724 * 725 * Both random assignment (splitLinesRandomly) and random assignment by key 726 * (splitLinesByKey) use a SplitOutputFiles struct to manage output files. 727 * 728 * The main properties of the output file set are specified in the constuctor. The 729 * exception is the header line. This is not known until the first input file is 730 * read, so it is specified in a separate 'setHeader' call. 731 * 732 * Individual output files are written to based on their zero-based index in the 733 * output collection. The caller selects the output file number to write to and 734 * calls 'writeDataLine' to write a line. The header is written if needed. 
 */
struct SplitOutputFiles
{
    import std.conv : to;
    import std.file : exists;
    import std.format : format;
    import std.path : buildPath;
    import std.stdio : File;

    /* Per-file bookkeeping: name, handle, and open/data state. */
    static struct OutputFile
    {
        string filename;
        File ofile;
        bool hasData;       // True once the file has a header or data line.
        bool isOpen;        // Track separately due to https://github.com/dlang/phobos/pull/7397
    }

    private uint _numFiles;
    private bool _writeHeaders;
    private uint _maxOpenFiles;

    private OutputFile[] _outputFiles;
    private uint _numOpenFiles = 0;    // Invariant: _numOpenFiles <= _maxOpenFiles.
    private string _header;

    this(uint numFiles, string dir, string filePrefix, string fileSuffix,
         uint fileDigitWidth, bool writeHeaders, uint maxOpenFiles)
    {
        assert(numFiles >= 2);
        assert(maxOpenFiles >= 1);

        _numFiles = numFiles;
        _writeHeaders = writeHeaders;
        _maxOpenFiles = maxOpenFiles;

        _outputFiles.length = numFiles;

        /* Filename assignment. '%.*d' zero-pads the index to fileDigitWidth digits. */
        foreach (i, ref f; _outputFiles)
        {
            f.filename =
                buildPath(dir, format("%s%.*d%s", filePrefix, fileDigitWidth, i, fileSuffix));
        }
    }

    /* Destructor ensures all files are closed.
     *
     * Note: A dual check on whether the file is open is made. This is to avoid a
     * Phobos bug where std.File doesn't properly maintain the state of open files
     * if the File.open call fails. See: https://github.com/dlang/phobos/pull/7397.
     */
    ~this()
    {
        foreach (ref f; _outputFiles)
        {
            if (f.isOpen && f.ofile.isOpen)
            {
                assert(_numOpenFiles >= 1);

                f.ofile.close;
                f.isOpen = false;
                _numOpenFiles--;
            }
        }
    }

    /* Check if any of the files already exist.
     *
     * Returns the empty string if none of the files exist. Otherwise returns the
     * filename of the first existing file found. This is to facilitate error
     * message generation.
     */
    string checkIfFilesExist()
    {
        foreach (f; _outputFiles) if (f.filename.exists) return f.filename;
        return "";
    }

    /* Sets the header line.
     *
     * Should be called prior to writeDataLine when headers are being written. This
     * method is separate from the constructor because the header is not available
     * until the first line of a file is read.
     *
     * Headers are only written if 'writeHeaders' is specified as true in the
     * constructor. As a convenience, this routine can be called even if headers are
     * not being written.
     */
    void setHeader(const char[] header)
    {
        _header = header.to!string;
    }

    /* Picks a random file to close. Used when the open file handle limit has been
     * reached.
     */
    private void closeSomeFile()
    {
        import std.random : uniform;
        assert(_numOpenFiles > 0);

        /* Scan for an open file starting from a random position, wrapping around. */
        immutable uint start = uniform(0, _numFiles);

        foreach (i; cycle(iota(_numFiles), start).take(_numFiles))
        {
            if (_outputFiles[i].isOpen)
            {
                _outputFiles[i].ofile.close;
                _outputFiles[i].isOpen = false;
                _numOpenFiles--;

                return;
            }
        }

        assert(false, "[SplitOutputFiles.closeSomeFile]: Could not find file to close.");
    }

    /* Write a line to the specified file number.
     *
     * A header is written to the file if headers are being written and this is the
     * first data written to the file.
     */
    void writeDataLine(uint fileNum, const char[] data)
    {
        assert(fileNum < _numFiles);
        assert(fileNum < _outputFiles.length);
        assert(_numOpenFiles <= _maxOpenFiles);

        OutputFile* outputFile = &_outputFiles[fileNum];

        if (!outputFile.isOpen)
        {
            /* Evict another open file first if at the open-handle cap. */
            if (_numOpenFiles == _maxOpenFiles) closeSomeFile();
            assert(_numOpenFiles < _maxOpenFiles);

            outputFile.ofile = outputFile.filename.File("a");
            outputFile.isOpen = true;
            _numOpenFiles++;

            if (!outputFile.hasData)
            {
                /* Append mode: a pre-existing non-empty file counts as having data,
                 * so the header is not re-written. (ulong.max guards an error size.)
                 */
                ulong filesize = outputFile.ofile.size;
                outputFile.hasData = (filesize > 0 && filesize != ulong.max);
            }
        }

        if (_writeHeaders && !outputFile.hasData) outputFile.ofile.writeln(_header);

        outputFile.ofile.writeln(data);
        outputFile.hasData = true;
    }
}

/** Write input lines to multiple files, randomly selecting an output file for each line.
 */
void splitLinesRandomly(TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles)
{
    import std.random : Random = Mt19937, uniform;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    auto randomGenerator = Random(cmdopt.seed);

    /* Process each line. */
    foreach (inputFileNum, filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the first file's header is used for output. */
                if (inputFileNum == 0) outputFiles.setHeader(line);
            }
            else
            {
                immutable uint outputFileNum = uniform(0, cmdopt.numFiles, randomGenerator);
                outputFiles.writeDataLine(outputFileNum, line);
            }
        }

        /* Close input files immediately after use to preserve open file handles.
917 * File close occurs when variable goes out scope, but not immediately in the 918 * case of loop termination. Avoids open file errors when the number of 919 * output files exceeds the open file limit. 920 */ 921 if (filename != "-") inputStream.close; 922 } 923 } 924 925 /** Write input lines to multiple output files using fields as a random selection key. 926 * 927 * Each input line is written to an output file. The output file is chosen using 928 * fields as a key. Each unique key is assigned to a file. All lines having the 929 * same key are written to the same file. 930 */ 931 void splitLinesByKey(TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) 932 { 933 import std.algorithm : splitter; 934 import std.conv : to; 935 import std.digest.murmurhash; 936 import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, throwIfWindowsNewlineOnUnix; 937 938 assert(cmdopt.keyFields.length > 0); 939 940 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 941 942 /* Create a mapping for the key fields. */ 943 auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); 944 945 /* Process each line. */ 946 foreach (inputFileNum, filename; cmdopt.files) 947 { 948 auto inputStream = (filename == "-") ? stdin : filename.File(); 949 foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1)) 950 { 951 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 952 if (fileLineNum == 1 && cmdopt.hasHeader) 953 { 954 if (inputFileNum == 0) outputFiles.setHeader(line); 955 } 956 else 957 { 958 /* Murmurhash works by successively adding individual keys, then finalizing. 959 * Adding individual keys is simpler if the full-line-as-key and individual 960 * fields as keys cases are separated. 
                 */
                auto hasher = MurmurHash3!32(cmdopt.seed);

                if (cmdopt.keyIsFullLine)
                {
                    hasher.put(cast(ubyte[]) line);
                }
                else
                {
                    assert(keyFieldsReordering !is null);

                    /* Gather the key field values and assemble the key. */
                    keyFieldsReordering.initNewLine;
                    foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                    {
                        keyFieldsReordering.processNextField(fieldIndex, fieldValue);
                        if (keyFieldsReordering.allFieldsFilled) break;
                    }

                    if (!keyFieldsReordering.allFieldsFilled)
                    {
                        import std.format : format;
                        throw new Exception(
                            format("Not enough fields in line. File: %s, Line: %s",
                                   (filename == "-") ? "Standard Input" : filename, fileLineNum));
                    }

                    /* Join the key fields with the delimiter so the multi-field key
                     * hashes the same as the delimited text would.
                     */
                    foreach (count, key; keyFieldsReordering.outputFields.enumerate)
                    {
                        if (count > 0) hasher.put(delimArray);
                        hasher.put(cast(ubyte[]) key);
                    }
                }

                hasher.finish;
                immutable uint outputFileNum = hasher.get % cmdopt.numFiles;
                outputFiles.writeDataLine(outputFileNum, line);
            }
        }

        /* Close input files immediately after use to preserve open file handles.
         * File close occurs when the variable goes out of scope, but not immediately
         * in the case of loop termination. Avoids open file errors when the number of
         * output files exceeds the open file limit.
         */
        if (filename != "-") inputStream.close;
    }
}

/** Write input lines to multiple files, splitting based on line count.
 *
 * Note: readBufferSize is an argument primarily for unit test purposes. Normal uses
 * should use the default value.
 */
void splitByLineCount(TsvSplitOptions cmdopt, const size_t readBufferSize = 1024L * 512L)
{
    import std.array : appender;
    import std.file : exists;
    import std.format : format;
    import std.path : buildPath;
    import std.stdio : File;

    assert(readBufferSize > 0);
    ubyte[] readBuffer = new ubyte[readBufferSize];

    auto header = appender!(ubyte[])();
    bool headerSaved = !cmdopt.headerInOut;    // True if 'header' has been saved, or does not need to be.
    size_t nextOutputFileNum = 0;
    File outputFile;
    string outputFileName;
    bool isOutputFileOpen = false;             // Open file status tracked separately due to phobos bugs
    size_t outputFileRemainingLines;

    /* nextNewlineIndex finds the index of the next newline character. It is an
     * alternative to std.algorithm.countUntil. Invoking 'find' directly results in
     * 'memchr' being used (faster). The current 'countUntil' implementation does
     * forward to find, but the way it is done avoids the memchr call optimization.
     */
    static long nextNewlineIndex(const ubyte[] buffer)
    {
        import std.algorithm : find;
        immutable ubyte newlineChar = '\n';
        immutable size_t buflen = buffer.length;
        immutable size_t findlen = buffer.find(newlineChar).length;

        return findlen > 0 ? buflen - findlen : -1;
    }

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File("rb");
        bool isReadingHeader = cmdopt.hasHeader;

        foreach (ref ubyte[] inputChunk; inputStream.byChunk(readBuffer))
        {
            size_t nextOutputChunkStart = 0;

            if (isReadingHeader)
            {
                immutable newlineIndex = nextNewlineIndex(inputChunk);

                if (newlineIndex == -1)
                {
                    /* Rare case - Header line longer than read buffer. Keep reading
                     * the header.
                     */
                    if (!headerSaved) put(header, inputChunk);
                    continue;
                }
                else
                {
                    if (!headerSaved)
                    {
                        put(header, inputChunk[0 .. newlineIndex + 1]);
                        headerSaved = true;
                    }
                    isReadingHeader = false;
                    nextOutputChunkStart = newlineIndex + 1;
                }
            }

            /* Done with the header. Process the rest of the inputChunk. */

            auto remainingInputChunk = inputChunk[nextOutputChunkStart .. $];

            while (!remainingInputChunk.empty)
            {
                /* See if the next output file needs to be opened. */
                if (!isOutputFileOpen)
                {
                    outputFileName =
                        buildPath(cmdopt.dir,
                                  format("%s%.*d%s", cmdopt.prefix,
                                         cmdopt.digitWidth, nextOutputFileNum, cmdopt.suffix));

                    if (!cmdopt.appendToExistingFiles && outputFileName.exists)
                    {
                        throw new Exception(
                            format("Output file already exists. Use '--a|append' to append to existing files. File: '%s'.",
                                   outputFileName));
                    }

                    outputFile = outputFileName.File("ab");
                    isOutputFileOpen = true;
                    ++nextOutputFileNum;
                    outputFileRemainingLines = cmdopt.linesPerFile;

                    assert(headerSaved);

                    if (cmdopt.headerInOut)
                    {
                        /* Only write the header if the file is empty. NOTE(review):
                         * ulong.max appears to flag an unknown (non-seekable) size and
                         * is treated like empty — confirm against File.size docs.
                         */
                        ulong filesize = outputFile.size;
                        if (filesize == 0 || filesize == ulong.max) outputFile.rawWrite(header.data);
                    }
                }

                /* Find more newlines for the current output file. */

                assert(outputFileRemainingLines > 0);

                size_t nextOutputChunkEnd = nextOutputChunkStart;

                while (outputFileRemainingLines != 0 && !remainingInputChunk.empty)
                {
                    /* Note: newlineIndex is relative to 'remainingInputChunk', not
                     * 'inputChunk'. Updates to variables referring to 'inputChunk'
                     * need to reflect this. In particular, 'nextOutputChunkEnd'.
                     */
                    immutable newlineIndex = nextNewlineIndex(remainingInputChunk);

                    if (newlineIndex == -1)
                    {
                        nextOutputChunkEnd = inputChunk.length;
                    }
                    else
                    {
                        --outputFileRemainingLines;
                        nextOutputChunkEnd += (newlineIndex + 1);
                    }

                    remainingInputChunk = inputChunk[nextOutputChunkEnd .. $];
                }

                assert(nextOutputChunkStart < nextOutputChunkEnd);
                assert(nextOutputChunkEnd <= inputChunk.length);

                outputFile.rawWrite(inputChunk[nextOutputChunkStart .. nextOutputChunkEnd]);

                if (outputFileRemainingLines == 0)
                {
                    outputFile.close;
                    isOutputFileOpen = false;
                }

                nextOutputChunkStart = nextOutputChunkEnd;

                assert(remainingInputChunk.length == inputChunk.length - nextOutputChunkStart);
            }
        }
    }
}

/* splitByLineCount unit tests.
 *
 * These tests are primarily for buffer management. There are edge cases involving the
 * interaction of buffer size, input file size, lines-per-file, and newline placement
 * that are difficult to test against the executable.
 */
unittest
{
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.algorithm : min;
    import std.array : appender;
    import std.conv : to;
    import std.file : exists, mkdir, rmdirRecurse;
    import std.format : format;
    import std.path : buildPath;
    import std.process : escapeShellCommand, executeShell;

    /* Test setup
     *
     * A set of twenty-four input files is created, with names: input_NxM.txt, where
     * N is the number of characters in each row and M is the number of rows (lines).
     * The resulting files are put in the "lc_input" directory ('inputDir' variable)
     * and have names:
     *     input_0x2.txt, input_0x3.txt, ... input_5x5.txt.
     *
     * A standalone block of code produces the expected result files for splitting an
     * input file into a set of output files. This duplicates the splitByLineCount
     * output. This is done for lines-per-file counts 1 to 5. Each result set is placed
     * in a subdirectory under "lc_expected" ('expectedDir' variable). Subdirectories
     * have names like: "0x2_by_1", "0x3_by_1", ..., "5x5_by_4".
     *
     * splitByLineCount is called for all the same input files and lines-per-file
     * settings used to produce the expected output. This is done via
     * testSplitByLineCount, which calls command line argument processing and
     * splitByLineCount, similar to how the main program works. The results are written
     * to a subdirectory. The subdirectory is compared to the expected output directory
     * using the system 'diff' command.
     *
     * splitByLineCount is called multiple times for each expected output case. The
     * different calls iterate over a series of small readBufferSizes. This is how
     * tests for edge cases in the readBufferSize vs line lengths, newline placement,
     * etc., are accomplished.
     *
     * Note: One way to understand what is going on is to comment out the line:
     *
     *    scope(exit) testDir.rmdirRecurse;
     *
     * Then run the test (e.g. 'make test') and look at the directory structure left
     * behind. Print out the 'testDir' directory to see where it is located.
     */

    /* testSplitByLineCount acts as a surrogate for main() and tsvSplit(). It makes the
     * call to splitByLineCount and calls 'diff' to compare the output directory to the
     * expected directory. An assert is thrown if the directories do not match.
     */
    static void testSplitByLineCount(string[] cmdArgs, string expectedDir,
                                     size_t readBufferSize = 1024L * 512L)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testSplitByLineCount] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSplitByLineCount] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSplitOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
        assert(cmdopt.linesPerFile != 0, "[testSplitByLineCount] --lines-per-file is required.");
        assert(!cmdopt.dir.empty, "[testSplitByLineCount] --dir is required.");

        splitByLineCount(cmdopt, readBufferSize);

        /* Diff command setup. */
        auto diffCmdArgs = ["diff", expectedDir, cmdopt.dir];
        auto diffResult = executeShell(escapeShellCommand(diffCmdArgs));
        assert(diffResult.status == 0,
               format("[testSplitByLineCount]\n cmd: %s\n readBufferSize: %d\n expectedDir: %s\n------ Diff ------%s\n-------",
                      savedCmdArgs, readBufferSize, expectedDir, diffResult.output));
    }

    auto testDir = makeUnittestTempDir("tsv_split");
    scope(exit) testDir.rmdirRecurse;

    auto inputDir = buildPath(testDir, "lc_input");
    auto outputDir = buildPath(testDir, "lc_output");
    auto expectedDir = buildPath(testDir, "lc_expected");

    mkdir(inputDir);
    mkdir(outputDir);
    mkdir(expectedDir);

    /* Returns the canonical input file path for a line-length/row-count combo. */
    static string buildInputFilePath(string dir, long inputLineLength, long inputFileNumLines)
    {
        return buildPath(dir, format("input_%dx%d.txt", inputLineLength, inputFileNumLines));
    }

    string[5] outputRowData =
        [
            "abcde",
            "fghij",
            "klmno",
            "pqrst",
            "uvwxy"
        ];

    /* The main test loop. Iterates over input line lengths, numbers of rows,
     * lines-per-file, and finally readBufferSize lengths. All combos are tested.
     */
    foreach (inputLineLength; 0 .. 6)
    {
        foreach (inputFileNumLines; 2 .. 6)
        {
            auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);

            {
                auto ofile = inputFile.File("w");
                auto output = appender!(char[])();
                foreach (m; 0 .. inputFileNumLines)
                {
                    put(output, outputRowData[m][0 .. inputLineLength]);
                    put(output, '\n');
                }
                ofile.write(output.data);
                ofile.close;
            }

            /* Iterate over the different lines-per-file lengths.
             * - Create an expected output directory and files for each.
             * - Test with different readBufferSize values.
             */
            foreach (outputFileNumLines; 1 .. min(5, inputFileNumLines))
            {
                auto expectedSubDir =
                    buildPath(expectedDir, format("%dx%d_by_%d", inputLineLength,
                                                  inputFileNumLines, outputFileNumLines));
                mkdir(expectedSubDir);

                size_t filenum = 0;
                size_t linesWritten = 0;
                while (linesWritten < inputFileNumLines)
                {
                    auto expectedFile = buildPath(expectedSubDir, format("part_%d.txt", filenum));
                    auto f = expectedFile.File("w");
                    auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);
                    foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
                    {
                        f.writeln(line[0 .. inputLineLength]);
                    }
                    linesWritten += linesToWrite;
                    ++filenum;
                    f.close;
                }

                /* Test the different readBufferSizes.
                 * - An output directory is created for the run and deleted afterward.
                 * - First test the default size.
                 * - Then iterate over small readBufferSize values.
                 */
                auto outputSubDir =
                    buildPath(outputDir, format("%dx%d_by_%d", inputLineLength,
                                                inputFileNumLines, outputFileNumLines));
                mkdir(outputSubDir);

                testSplitByLineCount(
                    ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                     "--digit-width", "1", inputFile],
                    expectedSubDir);

                outputSubDir.rmdirRecurse;

                foreach (readBufSize; 1 .. 8)
                {
                    mkdir(outputSubDir);

                    testSplitByLineCount(
                        ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                         "--digit-width", "1", inputFile],
                        expectedSubDir, readBufSize);

                    outputSubDir.rmdirRecurse;
                }
            }
        }
    }

    {
        /* Tests for the special case where readBufferSize is smaller than the header
         * line. We'll reuse the input_5x4.txt input file and write 1 line-per-file.
         */
        immutable inputLineLength = 5;
        immutable inputFileNumLines = 4;
        immutable outputFileNumLines = 1;

        auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);
        assert(inputFile.exists);

        auto expectedSubDirHeader =
            buildPath(expectedDir, format("%dx%d_by_%d_header", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        auto expectedSubDirHeaderInOnly =
            buildPath(expectedDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        mkdir(expectedSubDirHeader);
        mkdir(expectedSubDirHeaderInOnly);

        /* Generate the expected results. Cheat by starting with linesWritten = 1. This
         * automatically excludes the header line, but keeps the loop code consistent
         * with the main test loop.
         */
        size_t filenum = 0;
        size_t linesWritten = 1;
        while (linesWritten < inputFileNumLines)
        {
            auto expectedFileHeader = buildPath(expectedSubDirHeader, format("part_%d.txt", filenum));
            auto expectedFileHeaderInOnly = buildPath(expectedSubDirHeaderInOnly,
                                                      format("part_%d.txt", filenum));
            auto fHeader = expectedFileHeader.File("w");
            auto fHeaderInOnly = expectedFileHeaderInOnly.File("w");
            auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);

            fHeader.writeln(outputRowData[0][0 .. inputLineLength]);
            foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
            {
                fHeader.writeln(line[0 .. inputLineLength]);
                fHeaderInOnly.writeln(line[0 .. inputLineLength]);
            }
            linesWritten += linesToWrite;
            ++filenum;
            fHeader.close;
            fHeaderInOnly.close;
        }

        /* Now run the tests. */
        auto outputSubDirHeader =
            buildPath(outputDir, format("%dx%d_by_%d_header", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));
        auto outputSubDirHeaderInOnly =
            buildPath(outputDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));

        foreach (readBufSize; 1 .. 6)
        {
            mkdir(outputSubDirHeader);
            mkdir(outputSubDirHeaderInOnly);

            testSplitByLineCount(
                ["test", "--header", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeader, "--digit-width", "1", inputFile],
                expectedSubDirHeader, readBufSize);

            testSplitByLineCount(
                ["test", "--header-in-only", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeaderInOnly, "--digit-width", "1", inputFile],
                expectedSubDirHeaderInOnly, readBufSize);

            outputSubDirHeader.rmdirRecurse;
            outputSubDirHeaderInOnly.rmdirRecurse;
        }
    }
}