/**
Command line tool for splitting a file (or files) into multiple output files.
Several methods for splitting are available, including splitting by line count,
splitting by random assignment, and splitting by random assignment based on
key fields.

Copyright (c) 2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_split;

import std.exception : enforce;
import std.format : format;
import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program.
     *
     * Invokes command line argument processing and calls tsvSplit to do the real
     * work. Errors occurring during processing are caught and reported to the user.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try
        {
            tsvSplit(cmdopt);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

immutable helpText = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

By default, files are written to the current directory and have names
of the form 'part_NNN<suffix>', with 'NNN' being a number and <suffix>
being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The output
directory and file names are customizable.

Use '--help-verbose' for more detailed information.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

Output files: By default, files are written to the current directory and
have names of the form 'part_NNN<suffix>', with 'NNN' being a number and
<suffix> being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The suffix is
empty when reading from standard input. The numeric part defaults to 3
digits for '--l|lines-per-files'. For '--n|num-files' enough digits are
used so all filenames are the same length. The output directory and file
names are customizable.

Header lines: There are two ways to handle input with headers: write a
header to all output files (--H|header), or exclude headers from all
output files ('--I|header-in-only'). The best choice depends on the
follow-up processing. All tsv-utils tools support header lines in multiple
input files, but many other tools do not. For example, GNU parallel works
best on files without header lines.

Random assignment (--n|num-files): Random distribution of records to a set
of files is a common task. When data fits in memory the preferred approach
is usually to shuffle the data and split it into fixed sized blocks. E.g.
'tsv-sample data.tsv | tsv-split -l NUM'. However, alternate approaches
are needed when data is too large for convenient shuffling. tsv-split's
random assignment feature is useful in this case. Each input line is
written to a randomly selected output file. Note that output files will
have similar but not identical numbers of records.

Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): This
splits a data set into multiple files sharded by key. All lines with the
same key are written to the same file. This partitioning enables parallel
computation based on the key. For example, statistical calculation
('tsv-summarize --group-by') or duplicate removal ('tsv-uniq --fields').
These operations can be parallelized using tools like GNU parallel, which
simplifies concurrent operations on multiple files.

Random seed: By default, each tsv-split invocation using random assignment
or random assignment by key produces different assignments to the output
files. Using '--s|static-seed' changes this so multiple runs produce the
same assignments. This works by using the same random seed each run. The
seed can be specified using '--v|seed-value'.

Appending to existing files: By default, an error is triggered if an
output file already exists. '--a|append' changes this so that lines are
appended to existing files. (Header lines are not appended to files with
data.) This is useful when adding new data to files created by a previous
tsv-split run. Random assignment should use the same '--n|num-files' value
each run, but different random seeds (avoid '--s|static-seed'). Random
assignment by key should use the same '--n|num-files', '--k|key-fields',
and seed ('--s|static-seed' or '--v|seed-value') each run.

Max number of open files: Random assignment and random assignment by key
are dramatically faster when all output files are kept open. However,
keeping a large numbers of open files can bump into system limits or limit
resources available to other processes. By default, tsv-split uses up to
4096 open files or the system per-process limit, whichever is smaller.
This can be changed using '--max-open-files', though it cannot be set
larger than the system limit. The system limit varies considerably between
systems. On many systems it is unlimited. On MacOS it is often set to 256.
Use Unix 'ulimit' to display and modify the limits:
* 'ulimit -n' - Show the "soft limit". The per-process maximum.
* 'ulimit -Hn' - Show the "hard limit". The max allowed soft limit.
* 'ulimit -Sn NUM' - Change the "soft limit" to NUM.

Examples:

# Split a 10 million line file into 1000 files, 10,000 lines each.
# Output files are part_000.txt, part_001.txt, ... part_999.txt.
tsv-split data.txt --lines-per-file 10000

# Same as the previous example, but write files to a subdirectory.
tsv-split data.txt --dir split_files --lines-per-file 10000

# Split a file into 10,000 line files, writing a header line to each
tsv-split data.txt -H --lines-per-file 10000

# Same as the previous example, but dropping the header line.
tsv-split data.txt -I --lines-per-file 10000

# Randomly assign lines to 1000 files
tsv-split data.txt --num-files 1000

# Randomly assign lines to 1000 files while keeping unique keys from
# field 3 together.
tsv-split data.tsv --num-files 1000 -k 3

# Randomly assign lines to 1000 files. Later, randomly assign lines
# from a second data file to the same output files.
tsv-split data1.tsv -n 1000
tsv-split data2.tsv -n 1000 --append

# Randomly assign lines to 1000 files using field 3 as a key.
# Later, add a second file to the same output files.
tsv-split data1.tsv -n 1000 -k 3 --static-seed
tsv-split data2.tsv -n 1000 -k 3 --static-seed --append

# Change the system per-process open file limit for one command.
# The parens create a sub-shell. The current shell is not changed.
( ulimit -Sn 1000 && tsv-split --num-files 1000 data.txt )

Options:
EOS";

/** Container for command line options and derived data.
 *
 * TsvSplitOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, TsvSplitOptions is used as a container
 * holding the specific processing options used by the splitting algorithms.
217 */ 218 struct TsvSplitOptions 219 { 220 import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; 221 222 enum invalidFileSuffix = "///////"; 223 224 string programName; /// Program name 225 InputSourceRange inputSources; /// Input files 226 bool helpVerbose = false; /// --help-verbose 227 bool headerInOut = false; /// --H|header 228 bool headerIn = false; /// --I|header-in-only 229 size_t linesPerFile = 0; /// --l|lines-per-file 230 uint numFiles = 0; /// --n|num-files 231 size_t[] keyFields; /// --k|key-fields 232 string dir; /// --dir 233 string prefix = "part_"; /// --prefix 234 string suffix = invalidFileSuffix; /// --suffix 235 uint digitWidth = 0; /// --w|digit-width 236 bool appendToExistingFiles = false; /// --a|append 237 bool staticSeed = false; /// --s|static-seed 238 uint seedValueOptionArg = 0; /// --v|seed-value 239 char delim = '\t'; /// --d|delimiter 240 uint maxOpenFilesArg = 0; /// --max-open-files 241 bool versionWanted = false; /// --V|version 242 bool hasHeader = false; /// Derived. True if either '--H|header' or '--I|header-in-only' is set. 243 bool keyIsFullLine = false; /// Derived. True if '--f|fields 0' is specfied. 244 bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value 245 uint seed = 0; /// Derived from --static-seed, --seed-value 246 uint maxOpenOutputFiles; /// Derived. 247 248 /** Process tsv-split command line arguments. 249 * 250 * Defines the command line options, performs validation, and derives additional 251 * state. std.getopt.getopt is called to do the main option processing followed 252 * additional validation and derivation. 253 * 254 * Help text is printed to standard output if help was requested. Error text is 255 * written to stderr if invalid input is encountered. 256 * 257 * A tuple is returned. 
First value is true if command line arguments were 258 * successfully processed and execution should continue, or false if an error 259 * occurred or the user asked for help. If false, the second value is the 260 * appropriate exit code (0 or 1). 261 * 262 * Returning true (execution continues) means args have been validated and derived 263 * values calculated. Field indices will have been converted to zero-based. 264 */ 265 auto processArgs(ref string[] cmdArgs) 266 { 267 import std.algorithm : all, canFind, each, min; 268 import std.file : exists, isDir; 269 import std.getopt; 270 import std.math : isNaN; 271 import std.path : baseName, expandTilde, extension, stripExtension; 272 import std.typecons : Yes, No; 273 import tsv_utils.common.utils : makeFieldListOptionHandler; 274 275 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 276 277 try 278 { 279 arraySep = ","; // Use comma to separate values in command line options 280 auto r = getopt( 281 cmdArgs, 282 "help-verbose", " Print more detailed help.", &helpVerbose, 283 284 std.getopt.config.caseSensitive, 285 "H|header", " Input files have a header line. Write the header to each output file.", &headerInOut, 286 "I|header-in-only", " Input files have a header line. Do not write the header to output files.", &headerIn, 287 std.getopt.config.caseInsensitive, 288 289 "l|lines-per-file", "NUM Number of lines to write to each output file (excluding the header line).", &linesPerFile, 290 "n|num-files", "NUM Number of output files to generate.", &numFiles, 291 "k|key-fields", "<field-list> Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.", 292 keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero), 293 294 "dir", "STR Directory to write to. Default: Current working directory.", &dir, 295 "prefix", "STR Filename prefix. 
Default: 'part_'", &prefix, 296 "suffix", "STR Filename suffix. Default: First input file extension. None for standard input.", &suffix, 297 "w|digit-width", "NUM Number of digits in filename numeric portion. Default: '--l|lines-per-file': 3. '--n|num-files': Chosen so filenames have the same length. '--w|digit-width 0' uses the default.", &digitWidth, 298 "a|append", " Append to existing files.", &appendToExistingFiles, 299 300 "s|static-seed", " Use the same random seed every run.", &staticSeed, 301 302 std.getopt.config.caseSensitive, 303 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, 304 std.getopt.config.caseInsensitive, 305 306 "d|delimiter", "CHR Field delimiter.", &delim, 307 "max-open-files", "NUM Maximum open file handles to use. Min of 5 required.", &maxOpenFilesArg, 308 309 std.getopt.config.caseSensitive, 310 "V|version", " Print version information and exit.", &versionWanted, 311 std.getopt.config.caseInsensitive, 312 ); 313 314 if (r.helpWanted) 315 { 316 defaultGetoptPrinter(helpText, r.options); 317 return tuple(false, 0); 318 } 319 else if (helpVerbose) 320 { 321 defaultGetoptPrinter(helpTextVerbose, r.options); 322 return tuple(false, 0); 323 } 324 else if (versionWanted) 325 { 326 import tsv_utils.common.tsvutils_version; 327 writeln(tsvutilsVersionNotice("tsv-split")); 328 return tuple(false, 0); 329 } 330 331 /* 332 * Validation and derivations. 
333 */ 334 335 enforce(linesPerFile != 0 || numFiles != 0, 336 "Either '--l|lines-per-file' or '--n|num-files' is required."); 337 338 enforce(linesPerFile == 0 || numFiles == 0, 339 "'--l|lines-per-file' and '--n|num-files' cannot be used together."); 340 341 enforce(linesPerFile == 0 || keyFields.length == 0, 342 "'--l|lines-per-file' and '--k|key-fields' cannot be used together."); 343 344 enforce(numFiles != 1, "'--n|num-files must be two or more."); 345 346 if (keyFields.length > 0) 347 { 348 if (keyFields.length == 1 && keyFields[0] == 0) 349 { 350 keyIsFullLine = true; 351 } 352 else 353 { 354 enforce(keyFields.all!(x => x != 0), 355 "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); 356 357 keyFields.each!((ref x) => --x); // Convert to zero-based indexing. 358 } 359 } 360 361 enforce(!(headerInOut && headerIn), 362 "Use only one of '--H|header' and '--I|header-in-only'."); 363 364 hasHeader = headerInOut || headerIn; 365 366 if (!dir.empty) 367 { 368 dir = dir.expandTilde; 369 enforce(dir.exists, format("Directory does not exist: --dir '%s'", dir)); 370 enforce(dir.isDir, format("Path is not a directory: --dir '%s'", dir)); 371 } 372 373 /* Seed. */ 374 import std.random : unpredictableSeed; 375 376 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 377 378 if (usingUnpredictableSeed) seed = unpredictableSeed; 379 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 380 else if (staticSeed) seed = 2438424139; 381 else assert(0, "Internal error, invalid seed option states."); 382 383 /* Maximum number of open files. Mainly applies when --num-files is used. 384 * 385 * Derive maxOpenOutputFiles. Inputs: 386 * - Internal default limit: 4096. This is a somewhat conservative setting. 387 * - rlimit open files limit. Defined by '$ ulimit -n'. 388 * - '--max-open-files' (maxOpenFilesArg). This adjusts the internal limit, 389 * but only up to the rlimit value. 
390 * - Four open files are reserved for stdin, stdout, stderr, and one input 391 * file. 392 */ 393 394 immutable uint internalDefaultMaxOpenFiles = 4096; 395 immutable uint numReservedOpenFiles = 4; 396 immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit(); 397 398 399 enforce(maxOpenFilesArg == 0 || maxOpenFilesArg > numReservedOpenFiles, 400 format("'--max-open-files' must be at least %d.", 401 numReservedOpenFiles + 1)); 402 403 enforce(maxOpenFilesArg <= rlimitOpenFilesLimit, 404 format("'--max-open-files' value (%d) greater current system limit (%d)." ~ 405 "\nRun 'ulimit -n' to see the soft limit." ~ 406 "\nRun 'ulimit -Hn' to see the hard limit." ~ 407 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 408 maxOpenFilesArg, rlimitOpenFilesLimit)); 409 410 enforce(rlimitOpenFilesLimit > numReservedOpenFiles, 411 format("System open file limit too small. Current value: %d. Must be %d or more." ~ 412 "\nRun 'ulimit -n' to see the soft limit." ~ 413 "\nRun 'ulimit -Hn' to see the hard limit." ~ 414 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 415 rlimitOpenFilesLimit, numReservedOpenFiles + 1)); 416 417 immutable uint openFilesLimit = 418 (maxOpenFilesArg != 0) 419 ? maxOpenFilesArg 420 : min(internalDefaultMaxOpenFiles, rlimitOpenFilesLimit); 421 422 assert(openFilesLimit > numReservedOpenFiles); 423 424 maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles; 425 426 /* Remaining command line args are files. 427 */ 428 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 429 cmdArgs.length = 1; 430 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 431 inputSources = inputSourceRange(filepaths, readHeader); 432 433 /* Suffix - If not provided, use the extension of the first input file. 434 * No suffix if reading from standard input. 435 */ 436 if (suffix == invalidFileSuffix) suffix = filepaths[0].extension; 437 438 /* Ensure forward slash is not included in the filename prefix and suffix. 
439 * Forward slash is an invalid Unix filename character. However, open file 440 * calls could match a directory path, resulting in unintended file 441 * creation. 442 * 443 * The other invalid filename character on Unix is the NULL character. 444 * However, the NULL character cannot be entered via Unix command lines, 445 * so there is no need to test for it explicitly. 446 */ 447 enforce(!prefix.canFind('/'), 448 "'--prefix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 449 450 enforce(!suffix.canFind('/'), 451 "'--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 452 453 /* Digit width - If not specified, or specified as zero, the width is 454 * determined by the number of files for --num-files, or defaulted to 3 455 * for --lines-per-file. 456 */ 457 if (digitWidth == 0) 458 { 459 if (numFiles > 0) 460 { 461 digitWidth = 1; 462 uint n = numFiles - 1; 463 while (n >= 10) 464 { 465 n /= 10; 466 ++digitWidth; 467 } 468 } 469 else 470 { 471 digitWidth = 3; 472 } 473 } 474 assert(digitWidth != 0); 475 } 476 catch (Exception exc) 477 { 478 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 479 return tuple(false, 1); 480 } 481 return tuple(true, 0); 482 } 483 } 484 485 /* TsvSplitOptions unit tests (command-line argument processing). 486 * 487 * Basic tests. Many cases are covered in executable tests, including all error cases, 488 * as errors write to stderr. 489 */ 490 unittest 491 { 492 import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. 493 import std.conv : to; 494 import std.file : mkdir, rmdirRecurse; 495 import std.path : buildPath; 496 497 /* A dummy file is used so we don't have to worry about the cases where command 498 * line processing might open a file. Don't want to use stanard input for this, 499 * at least in cases where it might try to read to get the header line. 
500 */ 501 auto testDir = makeUnittestTempDir("tsv_split_bylinecount"); 502 scope(exit) testDir.rmdirRecurse; 503 504 string somefile_txt = buildPath(testDir, "somefile.txt"); 505 somefile_txt.File("w").writeln("Hello World!"); 506 507 { 508 auto args = ["unittest", "--lines-per-file", "10", somefile_txt]; 509 TsvSplitOptions cmdopt; 510 const r = cmdopt.processArgs(args); 511 512 assert(cmdopt.linesPerFile == 10); 513 assert(cmdopt.keyFields.empty); 514 assert(cmdopt.numFiles == 0); 515 assert(cmdopt.hasHeader == false); 516 } 517 { 518 auto args = ["unittest", "--num-files", "20", somefile_txt]; 519 TsvSplitOptions cmdopt; 520 const r = cmdopt.processArgs(args); 521 522 assert(cmdopt.linesPerFile == 0); 523 assert(cmdopt.keyFields.empty); 524 assert(cmdopt.numFiles == 20); 525 assert(cmdopt.hasHeader == false); 526 } 527 { 528 auto args = ["unittest", "-n", "5", "--key-fields", "1-3", somefile_txt]; 529 TsvSplitOptions cmdopt; 530 const r = cmdopt.processArgs(args); 531 532 assert(cmdopt.linesPerFile == 0); 533 assert(cmdopt.keyFields == [0, 1, 2]); 534 assert(cmdopt.numFiles == 5); 535 assert(cmdopt.hasHeader == false); 536 assert(cmdopt.keyIsFullLine == false); 537 } 538 { 539 auto args = ["unittest", "-n", "5", "-k", "0", somefile_txt]; 540 TsvSplitOptions cmdopt; 541 const r = cmdopt.processArgs(args); 542 543 assert(cmdopt.linesPerFile == 0); 544 assert(cmdopt.numFiles == 5); 545 assert(cmdopt.hasHeader == false); 546 assert(cmdopt.keyIsFullLine == true); 547 } 548 { 549 auto args = ["unittest", "-n", "2", "--header", somefile_txt]; 550 TsvSplitOptions cmdopt; 551 const r = cmdopt.processArgs(args); 552 553 assert(cmdopt.headerInOut == true); 554 assert(cmdopt.hasHeader == true); 555 assert(cmdopt.headerIn == false); 556 } 557 { 558 auto args = ["unittest", "-n", "2", "--header-in-only", somefile_txt]; 559 TsvSplitOptions cmdopt; 560 const r = cmdopt.processArgs(args); 561 562 assert(cmdopt.headerInOut == false); 563 assert(cmdopt.hasHeader == true); 564 
assert(cmdopt.headerIn == true); 565 } 566 567 static void testSuffix(string[] args, string expectedSuffix) 568 { 569 TsvSplitOptions cmdopt; 570 auto savedArgs = args.to!string; 571 const r = cmdopt.processArgs(args); 572 573 assert(r[0], format("[testSuffix] cmdopt.processArgs(%s) returned false.", savedArgs)); 574 assert(cmdopt.suffix == expectedSuffix, 575 format("[testSuffix] Incorrect cmdopt.suffix. Expected: '%s', Actual: '%s'\n cmdopt.processArgs(%s)", 576 expectedSuffix, cmdopt.suffix, savedArgs)); 577 } 578 579 /* In these tests, don't use headers and when files are listed, use 'somefile_txt' first. 580 * This make sure there is no attempt to read standard input and that there won't be an 581 * open failure trying to find a file. 582 */ 583 testSuffix(["unittest", "-n", "2"], ""); 584 testSuffix(["unittest", "-n", "2", "--", "-"], ""); 585 testSuffix(["unittest", "-n", "2", "--suffix", "_123"], "_123"); 586 testSuffix(["unittest", "-n", "2", somefile_txt], ".txt"); 587 testSuffix(["unittest", "-n", "2", somefile_txt, "anotherfile.pqr"], ".txt"); 588 testSuffix(["unittest", "-n", "2", "--suffix", ".X", somefile_txt, "anotherfile.pqr"], ".X"); 589 testSuffix(["unittest", "-n", "2", "--suffix", "", somefile_txt], ""); 590 testSuffix(["unittest", "-n", "2", "--", "-", somefile_txt], ""); 591 testSuffix(["unittest", "-n", "2", "--", somefile_txt, "-"], ".txt"); 592 593 static void testDigitWidth(string[] args, uint expected) 594 { 595 TsvSplitOptions cmdopt; 596 auto savedArgs = args.to!string; 597 const r = cmdopt.processArgs(args); 598 599 assert(r[0], format("[testDigitWidth] cmdopt.processArgs(%s) returned false.", savedArgs)); 600 assert(cmdopt.digitWidth == expected, 601 format("[testDigitWidth] Incorrect cmdopt.digitWidth. 
Expected: %d, Actual: %d\n cmdopt.processArgs(%s)", 602 expected, cmdopt.digitWidth, savedArgs)); 603 } 604 605 testDigitWidth(["unittest", "-n", "2", somefile_txt], 1); 606 testDigitWidth(["unittest", "-n", "2", "--digit-width" , "0", somefile_txt], 1); 607 testDigitWidth(["unittest", "-n", "10", somefile_txt], 1); 608 testDigitWidth(["unittest", "-n", "11", somefile_txt], 2); 609 testDigitWidth(["unittest", "-n", "555", somefile_txt], 3); 610 testDigitWidth(["unittest", "-n", "555", "--digit-width" , "2", somefile_txt], 2); 611 testDigitWidth(["unittest", "-n", "555", "--digit-width" , "4", somefile_txt], 4); 612 testDigitWidth(["unittest", "-l", "10", somefile_txt], 3); 613 testDigitWidth(["unittest", "-l", "10000", somefile_txt], 3); 614 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "0", somefile_txt], 3); 615 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "1", somefile_txt], 1); 616 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "5", somefile_txt], 5); 617 } 618 619 /** Get the rlimit current number of open files the process is allowed. 620 * 621 * This routine returns the current soft limit on the number of open files the process 622 * is allowed. This is the number returned by the command: '$ ulimit -n'. 623 * 624 * This routine translates this value to a 'uint', as tsv-split uses 'uint' for 625 * tracking output files. The rlimit 'rlim_t' type is usually 'ulong' or 'long'. 626 * RLIM_INFINITY and any value larger than 'uint.max' is translated to 'uint.max'. 627 * 628 * An exception is thrown if call to 'getrlimit' fails. 
629 */ 630 uint rlimitCurrOpenFilesLimit() 631 { 632 import core.sys.posix.sys.resource : 633 rlim_t, rlimit, getrlimit, RLIMIT_NOFILE, RLIM_INFINITY, RLIM_SAVED_CUR; 634 import std.conv : to; 635 636 uint currOpenFileLimit = uint.max; 637 638 rlimit rlimitMaxOpenFiles; 639 640 enforce(getrlimit(RLIMIT_NOFILE, &rlimitMaxOpenFiles) == 0, 641 "Internal error: getrlimit call failed"); 642 643 if (rlimitMaxOpenFiles.rlim_cur != RLIM_INFINITY && 644 rlimitMaxOpenFiles.rlim_cur != RLIM_SAVED_CUR && 645 rlimitMaxOpenFiles.rlim_cur >= 0 && 646 rlimitMaxOpenFiles.rlim_cur <= uint.max) 647 { 648 currOpenFileLimit = rlimitMaxOpenFiles.rlim_cur.to!uint; 649 } 650 651 return currOpenFileLimit; 652 } 653 654 /** Invokes the proper split routine based on the command line arguments. 655 * 656 * This routine is the top-level control after command line argument processing is 657 * done. It's primary job is to set up data structures and invoke the correct 658 * processing routine based on the command line arguments. 659 */ 660 void tsvSplit(ref TsvSplitOptions cmdopt) 661 { 662 /* Check that the input files were setup as expected. Should at least have one 663 * input, stdin if nothing else. */ 664 assert(!cmdopt.inputSources.empty); 665 666 if (cmdopt.linesPerFile != 0) 667 { 668 splitByLineCount(cmdopt); 669 } 670 else 671 { 672 /* Randomly distribute input lines to a specified number of files. */ 673 674 auto outputFiles = 675 SplitOutputFiles(cmdopt.numFiles, cmdopt.dir, cmdopt.prefix, cmdopt.suffix, 676 cmdopt.digitWidth, cmdopt.headerInOut, cmdopt.maxOpenOutputFiles, 677 cmdopt.inputSources.front.header); 678 679 if (!cmdopt.appendToExistingFiles) 680 { 681 string existingFile = outputFiles.checkIfFilesExist; 682 enforce(existingFile.length == 0, 683 format("One or more output files already exist. Use '--a|append' to append to existing files. 
File: '%s'.", 684 existingFile)); 685 } 686 687 if (cmdopt.keyFields.length == 0) 688 { 689 splitLinesRandomly(cmdopt, outputFiles); 690 } 691 else 692 { 693 splitLinesByKey(cmdopt, outputFiles); 694 } 695 } 696 } 697 698 /** A SplitOutputFiles struct holds a collection of output files. 699 * 700 * This struct manages a collection of output files used when writing to multiple 701 * files at once. This includes constructing filenames, opening and closing files, 702 * and writing data and header lines. 703 * 704 * Both random assignment (splitLinesRandomly) and random assignment by key 705 * (splitLinesByKey) use a SplitOutputFiles struct to manage output files. 706 * 707 * The main properties of the output file set are specified in the constuctor. The 708 * exception is the header line. This is not known until the first input file is 709 * read, so it is specified in a separate 'setHeader' call. 710 * 711 * Individual output files are written to based on their zero-based index in the 712 * output collection. The caller selects the output file number to write to and 713 * calls 'writeDataLine' to write a line. The header is written if needed. 
714 */ 715 struct SplitOutputFiles 716 { 717 import std.conv : to; 718 import std.file : exists; 719 import std.path : buildPath; 720 import std.stdio : File; 721 722 static struct OutputFile 723 { 724 string filename; 725 File ofile; 726 bool hasData; 727 bool isOpen; // Track separately due to https://github.com/dlang/phobos/pull/7397 728 } 729 730 private uint _numFiles; 731 private bool _writeHeaders; 732 private uint _maxOpenFiles; 733 734 private OutputFile[] _outputFiles; 735 private uint _numOpenFiles = 0; 736 private string _header; 737 738 this(uint numFiles, string dir, string filePrefix, string fileSuffix, 739 uint fileDigitWidth, bool writeHeaders, uint maxOpenFiles, string header) 740 { 741 assert(numFiles >= 2); 742 assert(maxOpenFiles >= 1); 743 744 _numFiles = numFiles; 745 _writeHeaders = writeHeaders; 746 _maxOpenFiles = maxOpenFiles; 747 _header = header; 748 749 _outputFiles.length = numFiles; 750 751 /* Filename assignment. */ 752 foreach (i, ref f; _outputFiles) 753 { 754 f.filename = 755 buildPath(dir, format("%s%.*d%s", filePrefix, fileDigitWidth, i, fileSuffix)); 756 } 757 } 758 759 /* Destructor ensures all files are closed. 760 * 761 * Note: A dual check on whether the file is open is made. This is to avoid a 762 * Phobos bug where std.File doesn't properly maintain the state of open files 763 * if the File.open call fails. See: https://github.com/dlang/phobos/pull/7397. 764 */ 765 ~this() 766 { 767 foreach (ref f; _outputFiles) 768 { 769 if (f.isOpen && f.ofile.isOpen) 770 { 771 assert(_numOpenFiles >= 1); 772 773 f.ofile.close; 774 f.isOpen = false; 775 _numOpenFiles--; 776 } 777 } 778 } 779 780 /* Check if any of the files already exist. 781 * 782 * Returns the empty string if none of the files exist. Otherwise returns the 783 * filename of the first existing file found. This is to facilitate error 784 * message generation. 
785 */ 786 string checkIfFilesExist() 787 { 788 foreach (f; _outputFiles) if (f.filename.exists) return f.filename; 789 return ""; 790 } 791 792 /* Picks a random file to close. Used when the open file handle limit has been 793 * reached. 794 */ 795 private void closeSomeFile() 796 { 797 import std.random : uniform; 798 assert(_numOpenFiles > 0); 799 800 immutable uint start = uniform(0, _numFiles); 801 802 foreach (i; cycle(iota(_numFiles), start).take(_numFiles)) 803 { 804 if (_outputFiles[i].isOpen) 805 { 806 _outputFiles[i].ofile.close; 807 _outputFiles[i].isOpen = false; 808 _numOpenFiles--; 809 810 return; 811 } 812 } 813 814 assert(false, "[SplitOutputFiles.closeSomeFile]: Could not find file to close."); 815 } 816 817 /* Write a line to the specified file number. 818 * 819 * A header is written to the file if headers are being written and this is the 820 * first data written to the file. 821 */ 822 void writeDataLine(uint fileNum, const char[] data) 823 { 824 assert(fileNum < _numFiles); 825 assert(fileNum < _outputFiles.length); 826 assert(_numOpenFiles <= _maxOpenFiles); 827 828 OutputFile* outputFile = &_outputFiles[fileNum]; 829 830 if (!outputFile.isOpen) 831 { 832 if (_numOpenFiles == _maxOpenFiles) closeSomeFile(); 833 assert(_numOpenFiles < _maxOpenFiles); 834 835 outputFile.ofile = outputFile.filename.File("a"); 836 outputFile.isOpen = true; 837 _numOpenFiles++; 838 839 if (!outputFile.hasData) 840 { 841 ulong filesize = outputFile.ofile.size; 842 outputFile.hasData = (filesize > 0 && filesize != ulong.max); 843 } 844 } 845 846 if (_writeHeaders && !outputFile.hasData) outputFile.ofile.writeln(_header); 847 848 outputFile.ofile.writeln(data); 849 outputFile.hasData = true; 850 } 851 } 852 853 /** Write input lines to multiple files, randomly selecting an output file for each line. 
854 */ 855 void splitLinesRandomly(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) 856 { 857 import std.random : Random = Mt19937, uniform; 858 import tsv_utils.common.utils : bufferedByLine, InputSourceRange; 859 860 /* inputSources must be an InputSourceRange and include at least stdin. */ 861 assert(!cmdopt.inputSources.empty); 862 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 863 864 auto randomGenerator = Random(cmdopt.seed); 865 866 /* Process each line. */ 867 foreach (inputStream; cmdopt.inputSources) 868 { 869 foreach (line; inputStream.file.bufferedByLine) 870 { 871 immutable uint outputFileNum = uniform(0, cmdopt.numFiles, randomGenerator); 872 outputFiles.writeDataLine(outputFileNum, line); 873 } 874 } 875 } 876 877 /** Write input lines to multiple output files using fields as a random selection key. 878 * 879 * Each input line is written to an output file. The output file is chosen using 880 * fields as a key. Each unique key is assigned to a file. All lines having the 881 * same key are written to the same file. 882 */ 883 void splitLinesByKey(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) 884 { 885 import std.algorithm : splitter; 886 import std.conv : to; 887 import std.digest.murmurhash; 888 import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, 889 InputSourceRange, throwIfWindowsNewlineOnUnix; 890 891 assert(cmdopt.keyFields.length > 0); 892 893 /* inputSources must be an InputSourceRange and include at least stdin. */ 894 assert(!cmdopt.inputSources.empty); 895 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 896 897 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 898 899 /* Create a mapping for the key fields. */ 900 auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); 901 902 /* Process each line. */ 903 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 
2 : 1; 904 foreach (inputStream; cmdopt.inputSources) 905 { 906 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 907 908 foreach (fileLineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine)) 909 { 910 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 911 912 /* Murmurhash works by successively adding individual keys, then finalizing. 913 * Adding individual keys is simpler if the full-line-as-key and individual 914 * fields as keys cases are separated. 915 */ 916 auto hasher = MurmurHash3!32(cmdopt.seed); 917 918 if (cmdopt.keyIsFullLine) 919 { 920 hasher.put(cast(ubyte[]) line); 921 } 922 else 923 { 924 assert(keyFieldsReordering !is null); 925 926 /* Gather the key field values and assemble the key. */ 927 keyFieldsReordering.initNewLine; 928 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 929 { 930 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 931 if (keyFieldsReordering.allFieldsFilled) break; 932 } 933 934 enforce(keyFieldsReordering.allFieldsFilled, 935 format("Not enough fields in line. File: %s, Line: %s", 936 inputStream.name, fileLineNum)); 937 938 foreach (count, key; keyFieldsReordering.outputFields.enumerate) 939 { 940 if (count > 0) hasher.put(delimArray); 941 hasher.put(cast(ubyte[]) key); 942 } 943 } 944 945 hasher.finish; 946 immutable uint outputFileNum = hasher.get % cmdopt.numFiles; 947 outputFiles.writeDataLine(outputFileNum, line); 948 } 949 } 950 } 951 952 /** Write input lines to multiple files, splitting based on line count. 953 * 954 * Note: readBufferSize is an argument primarily for unit test purposes. Normal uses 955 * should use the default value. 
 */
void splitByLineCount(ref TsvSplitOptions cmdopt, const size_t readBufferSize = 1024L * 128L)
{
    import std.file : exists;
    import std.path : buildPath;
    import std.stdio : File;
    import tsv_utils.common.utils : InputSourceRange;

    assert (readBufferSize > 0);
    ubyte[] readBuffer = new ubyte[readBufferSize];

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* Header line (terminator included) replicated at the start of each output file.
     * Empty unless header-in-and-out behavior was requested.
     */
    string header = !cmdopt.headerInOut ? "" :
        cmdopt.inputSources.front.header(Yes.keepTerminator);
    size_t nextOutputFileNum = 0;
    File outputFile;
    string outputFileName;
    bool isOutputFileOpen = false;    // Open file status tracked separately due to phobos bugs
    size_t outputFileRemainingLines;  // Lines still to be written to the current output file.

    /* nextNewlineIndex finds the index of the next newline character. It is an
     * alternative to std.algorithm.countUntil. Invoking 'find' directly results
     * 'memchr' being used (faster). The current 'countUntil' implementation does
     * forward to find, but the way it is done avoids the memchr call optimization.
     * Returns -1 when the buffer contains no newline.
     */
    static long nextNewlineIndex(const ubyte[] buffer)
    {
        import std.algorithm : find;
        immutable ubyte newlineChar = '\n';
        immutable size_t buflen = buffer.length;
        immutable size_t findlen = buffer.find(newlineChar).length;

        return findlen > 0 ? buflen - findlen : -1;
    }

    foreach (inputStream; cmdopt.inputSources)
    {
        /* Read the input in raw chunks, scanning for newlines. Output files are
         * opened lazily and closed once their line quota is reached.
         */
        foreach (ref ubyte[] inputChunk; inputStream.file.byChunk(readBuffer))
        {
            size_t nextOutputChunkStart = 0;
            auto remainingInputChunk = inputChunk[nextOutputChunkStart .. $];

            while (!remainingInputChunk.empty)
            {
                /* See if the next output file needs to be opened. */
                if (!isOutputFileOpen)
                {
                    outputFileName =
                        buildPath(cmdopt.dir,
                                  format("%s%.*d%s", cmdopt.prefix,
                                         cmdopt.digitWidth, nextOutputFileNum, cmdopt.suffix));

                    enforce(cmdopt.appendToExistingFiles || !outputFileName.exists,
                            format("Output file already exists. Use '--a|append' to append to existing files. File: '%s'.",
                                   outputFileName));

                    outputFile = outputFileName.File("ab");
                    outputFile.setvbuf(1024L * 64L, _IOFBF);
                    isOutputFileOpen = true;
                    ++nextOutputFileNum;
                    outputFileRemainingLines = cmdopt.linesPerFile;

                    if (cmdopt.headerInOut)
                    {
                        /* Write the header only if the file is empty. A size of
                         * ulong.max means the size could not be determined.
                         */
                        ulong filesize = outputFile.size;
                        if (filesize == 0 || filesize == ulong.max) outputFile.rawWrite(header);
                    }
                }

                /* Find more newlines for the current output file. */

                assert(outputFileRemainingLines > 0);

                size_t nextOutputChunkEnd = nextOutputChunkStart;

                while (outputFileRemainingLines != 0 && !remainingInputChunk.empty)
                {
                    /* Note: newlineIndex is relative to 'remainingInputChunk', not
                     * 'inputChunk'. Updates to variables referring to 'inputChunk'
                     * need to reflect this. In particular, 'nextOutputChunkEnd'.
                     */
                    immutable newlineIndex = nextNewlineIndex(remainingInputChunk);

                    if (newlineIndex == -1)
                    {
                        /* No newline in the rest of the chunk; the partial line still
                         * belongs to the current output file.
                         */
                        nextOutputChunkEnd = inputChunk.length;
                    }
                    else
                    {
                        --outputFileRemainingLines;
                        nextOutputChunkEnd += (newlineIndex + 1);
                    }

                    remainingInputChunk = inputChunk[nextOutputChunkEnd .. $];
                }

                assert(nextOutputChunkStart < nextOutputChunkEnd);
                assert(nextOutputChunkEnd <= inputChunk.length);

                outputFile.rawWrite(inputChunk[nextOutputChunkStart .. nextOutputChunkEnd]);

                if (outputFileRemainingLines == 0)
                {
                    outputFile.close;
                    isOutputFileOpen = false;
                }

                nextOutputChunkStart = nextOutputChunkEnd;

                assert(remainingInputChunk.length == inputChunk.length - nextOutputChunkStart);
            }
        }
    }
}

/* splitByLineCount unit tests.
 *
 * These tests are primarily for buffer management. There are edge cases involving the
 * interaction of buffer size, input file size, lines-per-file, and newline placement
 * that are difficult to test against the executable.
 */
unittest
{
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.algorithm : min;
    import std.array : appender;
    import std.conv : to;
    import std.file : exists, mkdir, rmdirRecurse;
    import std.path : buildPath;
    import std.process : escapeShellCommand, executeShell;

    /* Test setup
     *
     * A set of twenty input files is created, with names: input_NxM.txt, where
     * N is the number of characters in each row and M is the number of rows (lines).
     * The resulting files are put in the "lc_input" directory ('inputDir' variable)
     * and have names:
     *     input_0x2.txt, input_0x3.txt, ... input_5x5.txt.
     *
     * A standalone block of code produces the expected result files for splitting an
     * input file into a set of output files. This duplicates the splitByLineCount
     * output. This is done for lines-per-file counts 1 to 5. Each result set is placed
     * in a subdirectory under "lc_expected" ('expectedDir' variable). Subdirectories
     * have names like: "0x2_by_1", "0x3_by_1", ..., "5x5_by_4".
     *
     * splitByLine is called for all the same input files and lines-per-file settings used
     * to produce the expected output.
     * This is done via testSplitByLineCount, which calls
     * command line argument processing and splitByLine, similar to how the main program
     * works. The results are written to a subdirectory. The subdirectory is compared to
     * the expected output directory using the system 'diff' command.
     *
     * splitByLine is called multiple times for each expected output case. The different
     * calls iterate over a series of small readBufferSizes. This is how tests for edge
     * cases in the readBufferSize vs line lengths, newline placement, etc., are
     * accomplished.
     *
     * Note: One way to understand what is going on is to comment out the line:
     *
     *    scope(exit) testDir.rmdirRecurse;
     *
     * Then run the test (e.g. 'make test') and look at the directory structure left
     * behind. Print out the 'testDir' directory to see where it is located.
     */

    /* testSplitByLineCount acts as a surrogate for main() and tsvSplit(). It makes the
     * call to splitByLineCount and calls 'diff' to compare the output directory to the
     * expected directory. An assert is thrown if the directories do not match.
     */
    static void testSplitByLineCount(string[] cmdArgs, string expectedDir,
                                     size_t readBufferSize = 1024L * 512L)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSplitByLineCount] cmdArgs must not be empty.");

        /* Prefixes assert failure messages with the test identity for easier triage. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSplitByLineCount] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSplitOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
        assert(cmdopt.linesPerFile != 0, "[testSplitByLineCount] --lines-per-file is required.");
        assert(!cmdopt.dir.empty, "[testSplitByLineCount] --dir is required.");

        splitByLineCount(cmdopt, readBufferSize);

        /* Diff command setup. Compares the produced directory to the expected one. */
        auto diffCmdArgs = ["diff", expectedDir, cmdopt.dir];
        auto diffResult = executeShell(escapeShellCommand(diffCmdArgs));
        assert(diffResult.status == 0,
               format("[testSplitByLineCount]\n cmd: %s\n readBufferSize: %d\n expectedDir: %s\n------ Diff ------%s\n-------",
                      savedCmdArgs, readBufferSize, expectedDir, diffResult.output));
    }

    auto testDir = makeUnittestTempDir("tsv_split_bylinecount");
    scope(exit) testDir.rmdirRecurse;

    auto inputDir = buildPath(testDir, "lc_input");
    auto outputDir = buildPath(testDir, "lc_output");
    auto expectedDir = buildPath(testDir, "lc_expected");

    mkdir(inputDir);
    mkdir(outputDir);
    mkdir(expectedDir);

    /* Builds the canonical input file path for a given line length / line count combo. */
    static string buildInputFilePath(string dir, long inputLineLength, long inputFileNumLines)
    {
        return buildPath(dir, format("input_%dx%d.txt", inputLineLength, inputFileNumLines));
    }

    string[5] outputRowData =
        [
            "abcde",
            "fghij",
            "klmno",
            "pqrst",
            "uvwxy"
        ];

    /* The main test loop. Iterates over input line lengths, numbers of rows,
     * lines-per-file, and finally readBufferSize lengths. All combos are tested.
     */
    foreach (inputLineLength; 0 .. 6)
    {
        foreach (inputFileNumLines; 2 .. 6)
        {
            auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);

            /* Generate the input file for this line length / line count combination. */
            {
                auto ofile = inputFile.File("w");
                auto output = appender!(char[])();
                foreach (m; 0 .. inputFileNumLines)
                {
                    put(output, outputRowData[m][0 .. inputLineLength]);
                    put(output, '\n');
                }
                ofile.write(output.data);
                ofile.close;
            }

            /* Iterate over the different lines-per-file lengths.
             * - Create an expected output directory and files for each.
             * - Test with different readBufferSize values.
             */
            foreach (outputFileNumLines; 1 .. min(5, inputFileNumLines))
            {
                auto expectedSubDir =
                    buildPath(expectedDir, format("%dx%d_by_%d", inputLineLength,
                                                  inputFileNumLines, outputFileNumLines));
                mkdir(expectedSubDir);

                size_t filenum = 0;
                size_t linesWritten = 0;
                while (linesWritten < inputFileNumLines)
                {
                    auto expectedFile = buildPath(expectedSubDir, format("part_%d.txt", filenum));
                    auto f = expectedFile.File("w");
                    auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);
                    foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
                    {
                        f.writeln(line[0 .. inputLineLength]);
                    }
                    linesWritten += linesToWrite;
                    ++filenum;
                    f.close;
                }

                /* Test the different readBufferSizes.
                 * - An output directory is created for the run and deleted afterward.
                 * - First test the default size.
                 * - Then iterate over small readBufferSize values.
                 */
                auto outputSubDir =
                    buildPath(outputDir, format("%dx%d_by_%d", inputLineLength,
                                                inputFileNumLines, outputFileNumLines));
                mkdir(outputSubDir);

                testSplitByLineCount(
                    ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                     "--digit-width", "1", inputFile],
                    expectedSubDir);

                outputSubDir.rmdirRecurse;

                foreach (readBufSize; 1 .. 8)
                {
                    mkdir(outputSubDir);

                    testSplitByLineCount(
                        ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                         "--digit-width", "1", inputFile],
                        expectedSubDir, readBufSize);

                    outputSubDir.rmdirRecurse;
                }
            }
        }
    }

    {
        /* Tests for the special case where readBufferSize is smaller than the header
         * line. We'll reuse the input_5x4.txt input file and write 1 line-per-file.
         */
        immutable inputLineLength = 5;
        immutable inputFileNumLines = 4;
        immutable outputFileNumLines = 1;

        auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);
        assert(inputFile.exists);

        auto expectedSubDirHeader =
            buildPath(expectedDir, format("%dx%d_by_%d_header", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        auto expectedSubDirHeaderInOnly =
            buildPath(expectedDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        mkdir(expectedSubDirHeader);
        mkdir(expectedSubDirHeaderInOnly);

        /* Generate the expected results. Cheat by starting with linesWritten = 1. This
         * automatically excludes the header line, but keeps the loop code consistent
         * with the main test loop.
         */
        size_t filenum = 0;
        size_t linesWritten = 1;
        while (linesWritten < inputFileNumLines)
        {
            auto expectedFileHeader = buildPath(expectedSubDirHeader, format("part_%d.txt", filenum));
            auto expectedFileHeaderInOnly = buildPath(expectedSubDirHeaderInOnly,
                                                      format("part_%d.txt", filenum));
            auto fHeader = expectedFileHeader.File("w");
            auto fHeaderInOnly = expectedFileHeaderInOnly.File("w");
            auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);

            /* The '--header' variant repeats the header in every file; the
             * '--header-in-only' variant strips it from all output files.
             */
            fHeader.writeln(outputRowData[0][0 .. inputLineLength]);
            foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
            {
                fHeader.writeln(line[0 .. inputLineLength]);
                fHeaderInOnly.writeln(line[0 .. inputLineLength]);
            }
            linesWritten += linesToWrite;
            ++filenum;
            fHeader.close;
            fHeaderInOnly.close;
        }

        /* Now run the tests. */
        auto outputSubDirHeader =
            buildPath(outputDir, format("%dx%d_by_%d_header", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));
        auto outputSubDirHeaderInOnly =
            buildPath(outputDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));

        foreach (readBufSize; 1 .. 6)
        {
            mkdir(outputSubDirHeader);
            mkdir(outputSubDirHeaderInOnly);

            testSplitByLineCount(
                ["test", "--header", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeader, "--digit-width", "1", inputFile],
                expectedSubDirHeader, readBufSize);

            testSplitByLineCount(
                ["test", "--header-in-only", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeaderInOnly, "--digit-width", "1", inputFile],
                expectedSubDirHeaderInOnly, readBufSize);

            outputSubDirHeader.rmdirRecurse;
            outputSubDirHeaderInOnly.rmdirRecurse;
        }
    }
}