/**
Command line tool for splitting a file (or files) into multiple output files.
Several methods for splitting are available, including splitting by line count,
splitting by random assignment, and splitting by random assignment based on
key fields.

Copyright (c) 2020-2021, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_split;

import std.exception : enforce;
import std.format : format;
import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program.
     *
     * Invokes command line argument processing and calls tsvSplit to do the real
     * work. Errors occurring during processing are caught and reported to the user.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try
        {
            tsvSplit(cmdopt);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

immutable helpText = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

By default, files are written to the current directory and have names
of the form 'part_NNN<suffix>', with 'NNN' being a number and <suffix>
being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The output
directory and file names are customizable.

Fields are specified using field number or field name. Field names
require that the input file has a header line.

Use '--help-verbose' for more detailed information.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

Output files: By default, files are written to the current directory and
have names of the form 'part_NNN<suffix>', with 'NNN' being a number and
<suffix> being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The suffix is
empty when reading from standard input. The numeric part defaults to 3
digits for '--l|lines-per-file'. For '--n|num-files' enough digits are
used so all filenames are the same length. The output directory and file
names are customizable.

Header lines: There are two ways to handle input with headers: write a
header to all output files (--H|header), or exclude headers from all
output files ('--I|header-in-only'). The best choice depends on the
follow-up processing. All tsv-utils tools support header lines in multiple
input files, but many other tools do not. For example, GNU parallel works
best on files without header lines.

Random assignment (--n|num-files): Random distribution of records to a set
of files is a common task. When data fits in memory the preferred approach
is usually to shuffle the data and split it into fixed sized blocks. E.g.
'tsv-sample data.tsv | tsv-split -l NUM'. However, alternate approaches
are needed when data is too large for convenient shuffling. tsv-split's
random assignment feature is useful in this case. Each input line is
written to a randomly selected output file. Note that output files will
have similar but not identical numbers of records.

Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): This
splits a data set into multiple files sharded by key. All lines with the
same key are written to the same file. This partitioning enables parallel
computation based on the key. For example, statistical calculation
('tsv-summarize --group-by') or duplicate removal ('tsv-uniq --fields').
These operations can be parallelized using tools like GNU parallel, which
simplifies concurrent operations on multiple files. Fields are specified
using field number or field name. Field names require that the input file
has a header line. Use '--help-fields' for details about field names.

Random seed: By default, each tsv-split invocation using random assignment
or random assignment by key produces different assignments to the output
files. Using '--s|static-seed' changes this so multiple runs produce the
same assignments. This works by using the same random seed each run. The
seed can be specified using '--v|seed-value'.

Appending to existing files: By default, an error is triggered if an
output file already exists. '--a|append' changes this so that lines are
appended to existing files. (Header lines are not appended to files with
data.) This is useful when adding new data to files created by a previous
tsv-split run. Random assignment should use the same '--n|num-files' value
each run, but different random seeds (avoid '--s|static-seed'). Random
assignment by key should use the same '--n|num-files', '--k|key-fields',
and seed ('--s|static-seed' or '--v|seed-value') each run.

Max number of open files: Random assignment and random assignment by key
are dramatically faster when all output files are kept open. However,
keeping a large number of open files can bump into system limits or limit
resources available to other processes. By default, tsv-split uses up to
4096 open files or the system per-process limit, whichever is smaller.
This can be changed using '--max-open-files', though it cannot be set
larger than the system limit. The system limit varies considerably between
systems. On many systems it is unlimited. On MacOS it is often set to 256.
Use Unix 'ulimit' to display and modify the limits:
* 'ulimit -n' - Show the "soft limit". The per-process maximum.
* 'ulimit -Hn' - Show the "hard limit". The max allowed soft limit.
* 'ulimit -Sn NUM' - Change the "soft limit" to NUM.

Examples:

  # Split a 10 million line file into 1000 files, 10,000 lines each.
  # Output files are part_000.txt, part_001.txt, ... part_999.txt.
  tsv-split data.txt --lines-per-file 10000

  # Same as the previous example, but write files to a subdirectory.
  tsv-split data.txt --dir split_files --lines-per-file 10000

  # Split a file into 10,000 line files, writing a header line to each
  tsv-split data.txt -H --lines-per-file 10000

  # Same as the previous example, but dropping the header line.
  tsv-split data.txt -I --lines-per-file 10000

  # Randomly assign lines to 1000 files
  tsv-split data.txt --num-files 1000

  # Randomly assign lines to 1000 files while keeping unique entries
  # from the 'url' field together.
  tsv-split data.tsv -H -k url --num-files 1000

  # Randomly assign lines to 1000 files. Later, randomly assign lines
  # from a second data file to the same output files.
  tsv-split data1.tsv -n 1000
  tsv-split data2.tsv -n 1000 --append

  # Randomly assign lines to 1000 files using field 3 as a key.
  # Later, add a second file to the same output files.
  tsv-split data1.tsv -n 1000 -k 3 --static-seed
  tsv-split data2.tsv -n 1000 -k 3 --static-seed --append

  # Change the system per-process open file limit for one command.
  # The parens create a sub-shell. The current shell is not changed.
  ( ulimit -Sn 1000 && tsv-split --num-files 1000 data.txt )

Options:
EOS";

/** Container for command line options and derived data.
 *
 * TsvSplitOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
219 * 220 * Once argument processing is complete, TsvSplitOptions is used as a container 221 * holding the specific processing options used by the splitting algorithms. 222 */ 223 struct TsvSplitOptions 224 { 225 import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; 226 227 enum invalidFileSuffix = "///////"; 228 229 string programName; /// Program name 230 InputSourceRange inputSources; /// Input files 231 bool headerInOut = false; /// --H|header 232 bool headerIn = false; /// --I|header-in-only 233 size_t linesPerFile = 0; /// --l|lines-per-file 234 uint numFiles = 0; /// --n|num-files 235 size_t[] keyFields; /// Derived: --k|key-fields 236 string dir; /// --dir 237 string prefix = "part_"; /// --prefix 238 string suffix = invalidFileSuffix; /// --suffix 239 uint digitWidth = 0; /// --w|digit-width 240 bool appendToExistingFiles = false; /// --a|append 241 bool staticSeed = false; /// --s|static-seed 242 uint seedValueOptionArg = 0; /// --v|seed-value 243 char delim = '\t'; /// --d|delimiter 244 uint maxOpenFilesArg = 0; /// --max-open-files 245 bool hasHeader = false; /// Derived. True if either '--H|header' or '--I|header-in-only' is set. 246 bool keyIsFullLine = false; /// Derived. True if '--f|fields 0' is specfied. 247 bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value 248 uint seed = 0; /// Derived from --static-seed, --seed-value 249 uint maxOpenOutputFiles; /// Derived. 250 251 /** Process tsv-split command line arguments. 252 * 253 * Defines the command line options, performs validation, and derives additional 254 * state. std.getopt.getopt is called to do the main option processing followed 255 * additional validation and derivation. 256 * 257 * Help text is printed to standard output if help was requested. Error text is 258 * written to stderr if invalid input is encountered. 259 * 260 * A tuple is returned. 
First value is true if command line arguments were 261 * successfully processed and execution should continue, or false if an error 262 * occurred or the user asked for help. If false, the second value is the 263 * appropriate exit code (0 or 1). 264 * 265 * Returning true (execution continues) means args have been validated and derived 266 * values calculated. Field indices will have been converted to zero-based. 267 */ 268 auto processArgs(ref string[] cmdArgs) 269 { 270 import std.algorithm : all, canFind, each, min; 271 import std.conv : to; 272 import std.file : exists, isDir; 273 import std.getopt; 274 import std.math : isNaN; 275 import std.path : baseName, expandTilde, extension, stripExtension; 276 import std.typecons : Yes, No; 277 import tsv_utils.common.fieldlist; 278 279 bool helpVerbose = false; // --help-verbose 280 bool helpFields = false; // --help-fields 281 bool versionWanted = false; // --V|version 282 string keyFieldsArg; // --k|key-fields 283 284 string keyFieldsOptionString = "k|key-fields"; 285 286 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 287 288 try 289 { 290 arraySep = ","; // Use comma to separate values in command line options 291 auto r = getopt( 292 cmdArgs, 293 "help-verbose", " Print more detailed help.", &helpVerbose, 294 "help-fields", " Print help on specifying fields.", &helpFields, 295 296 std.getopt.config.caseSensitive, 297 "H|header", " Input files have a header line. Write the header to each output file.", &headerInOut, 298 "I|header-in-only", " Input files have a header line. Do not write the header to output files.", &headerIn, 299 std.getopt.config.caseInsensitive, 300 301 "l|lines-per-file", "NUM Number of lines to write to each output file (excluding the header line).", &linesPerFile, 302 "n|num-files", "NUM Number of output files to generate.", &numFiles, 303 304 keyFieldsOptionString, 305 "<field-list> Fields to use as key. 
Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.", 306 &keyFieldsArg, 307 308 "dir", "STR Directory to write to. Default: Current working directory.", &dir, 309 "prefix", "STR Filename prefix. Default: 'part_'", &prefix, 310 "suffix", "STR Filename suffix. Default: First input file extension. None for standard input.", &suffix, 311 "w|digit-width", "NUM Number of digits in filename numeric portion. Default: '--l|lines-per-file': 3. '--n|num-files': Chosen so filenames have the same length. '--w|digit-width 0' uses the default.", &digitWidth, 312 "a|append", " Append to existing files.", &appendToExistingFiles, 313 314 "s|static-seed", " Use the same random seed every run.", &staticSeed, 315 316 std.getopt.config.caseSensitive, 317 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, 318 std.getopt.config.caseInsensitive, 319 320 "d|delimiter", "CHR Field delimiter.", &delim, 321 "max-open-files", "NUM Maximum open file handles to use. Min of 5 required.", &maxOpenFilesArg, 322 323 std.getopt.config.caseSensitive, 324 "V|version", " Print version information and exit.", &versionWanted, 325 std.getopt.config.caseInsensitive, 326 ); 327 328 if (r.helpWanted) 329 { 330 defaultGetoptPrinter(helpText, r.options); 331 return tuple(false, 0); 332 } 333 else if (helpVerbose) 334 { 335 defaultGetoptPrinter(helpTextVerbose, r.options); 336 return tuple(false, 0); 337 } 338 else if (helpFields) 339 { 340 writeln(fieldListHelpText); 341 return tuple(false, 0); 342 } 343 else if (versionWanted) 344 { 345 import tsv_utils.common.tsvutils_version; 346 writeln(tsvutilsVersionNotice("tsv-split")); 347 return tuple(false, 0); 348 } 349 350 /* Remaining command line args are files. 351 */ 352 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. 
$] : ["-"]; 353 cmdArgs.length = 1; 354 355 /* Validation and derivations - Do as much validation prior to header line 356 * processing as possible (avoids waiting on stdin). 357 * 358 * Note: keyFields depends on header line processing, but keyFieldsArg 359 * can be used to detect whether the command line argument was specified. 360 */ 361 362 enforce(!(headerInOut && headerIn), 363 "Use only one of '--H|header' and '--I|header-in-only'."); 364 365 hasHeader = headerInOut || headerIn; 366 367 enforce(linesPerFile != 0 || numFiles != 0, 368 "Either '--l|lines-per-file' or '--n|num-files' is required."); 369 370 enforce(linesPerFile == 0 || numFiles == 0, 371 "'--l|lines-per-file' and '--n|num-files' cannot be used together."); 372 373 enforce(linesPerFile == 0 || keyFieldsArg.length == 0, 374 "'--l|lines-per-file' and '--k|key-fields' cannot be used together."); 375 376 enforce(numFiles != 1, "'--n|num-files must be two or more."); 377 378 if (!dir.empty) 379 { 380 dir = dir.expandTilde; 381 enforce(dir.exists, format("Directory does not exist: --dir '%s'", dir)); 382 enforce(dir.isDir, format("Path is not a directory: --dir '%s'", dir)); 383 } 384 385 /* Seed. */ 386 import std.random : unpredictableSeed; 387 388 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 389 390 if (usingUnpredictableSeed) seed = unpredictableSeed; 391 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 392 else if (staticSeed) seed = 2438424139; 393 else assert(0, "Internal error, invalid seed option states."); 394 395 /* Maximum number of open files. Mainly applies when --num-files is used. 396 * 397 * Derive maxOpenOutputFiles. Inputs: 398 * - Internal default limit: 4096. This is a somewhat conservative setting. 399 * - rlimit open files limit. Defined by '$ ulimit -n'. 400 * - '--max-open-files' (maxOpenFilesArg). This adjusts the internal limit, 401 * but only up to the rlimit value. 
402 * - Four open files are reserved for stdin, stdout, stderr, and one input 403 * file. 404 */ 405 406 immutable uint internalDefaultMaxOpenFiles = 4096; 407 immutable uint numReservedOpenFiles = 4; 408 immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit(); 409 410 enforce(maxOpenFilesArg == 0 || maxOpenFilesArg > numReservedOpenFiles, 411 format("'--max-open-files' must be at least %d.", 412 numReservedOpenFiles + 1)); 413 414 enforce(maxOpenFilesArg <= rlimitOpenFilesLimit, 415 format("'--max-open-files' value (%d) greater current system limit (%d)." ~ 416 "\nRun 'ulimit -n' to see the soft limit." ~ 417 "\nRun 'ulimit -Hn' to see the hard limit." ~ 418 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 419 maxOpenFilesArg, rlimitOpenFilesLimit)); 420 421 enforce(rlimitOpenFilesLimit > numReservedOpenFiles, 422 format("System open file limit too small. Current value: %d. Must be %d or more." ~ 423 "\nRun 'ulimit -n' to see the soft limit." ~ 424 "\nRun 'ulimit -Hn' to see the hard limit." ~ 425 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 426 rlimitOpenFilesLimit, numReservedOpenFiles + 1)); 427 428 immutable uint openFilesLimit = 429 (maxOpenFilesArg != 0) 430 ? maxOpenFilesArg 431 : min(internalDefaultMaxOpenFiles, rlimitOpenFilesLimit); 432 433 assert(openFilesLimit > numReservedOpenFiles); 434 435 maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles; 436 437 /* Suffix - If not provided, use the extension of the first input file. 438 * No suffix if reading from standard input. 439 */ 440 if (suffix == invalidFileSuffix) suffix = filepaths[0].extension; 441 442 /* Ensure forward slash is not included in the filename prefix and suffix. 443 * Forward slash is an invalid Unix filename character. However, open file 444 * calls could match a directory path, resulting in unintended file 445 * creation. 446 * 447 * The other invalid filename character on Unix is the NULL character. 
448 * However, the NULL character cannot be entered via Unix command lines, 449 * so there is no need to test for it explicitly. 450 */ 451 enforce(!prefix.canFind('/'), 452 "'--prefix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 453 454 enforce(!suffix.canFind('/'), 455 "'--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 456 457 /* Digit width - If not specified, or specified as zero, the width is 458 * determined by the number of files for --num-files, or defaulted to 3 459 * for --lines-per-file. 460 */ 461 if (digitWidth == 0) 462 { 463 if (numFiles > 0) 464 { 465 digitWidth = 1; 466 uint n = numFiles - 1; 467 while (n >= 10) 468 { 469 n /= 10; 470 ++digitWidth; 471 } 472 } 473 else 474 { 475 digitWidth = 3; 476 } 477 } 478 assert(digitWidth != 0); 479 480 /* 481 * Create the inputSourceRange and perform header line processing. 482 */ 483 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 484 inputSources = inputSourceRange(filepaths, readHeader); 485 486 string[] headerFields; 487 488 if (hasHeader) headerFields = inputSources.front.header.split(delim).to!(string[]); 489 490 if (!keyFieldsArg.empty) 491 { 492 keyFields = 493 keyFieldsArg 494 .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) 495 (hasHeader, headerFields, keyFieldsOptionString) 496 .array; 497 } 498 499 if (keyFields.length > 0) 500 { 501 if (keyFields.length == 1 && keyFields[0] == 0) 502 { 503 keyIsFullLine = true; 504 } 505 else 506 { 507 enforce(keyFields.all!(x => x != 0), 508 "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); 509 510 keyFields.each!((ref x) => --x); // Convert to zero-based indexing. 
511 } 512 } 513 514 } 515 catch (Exception exc) 516 { 517 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 518 return tuple(false, 1); 519 } 520 return tuple(true, 0); 521 } 522 } 523 524 /* TsvSplitOptions unit tests (command-line argument processing). 525 * 526 * Basic tests. Many cases are covered in executable tests, including all error cases, 527 * as errors write to stderr. 528 */ 529 unittest 530 { 531 import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. 532 import std.conv : to; 533 import std.file : mkdir, rmdirRecurse; 534 import std.path : buildPath; 535 536 /* A pair of dummy files are used so we don't have to worry about the cases where 537 * command line processing might open a file. Don't want to use standard input for 538 * this, at least in cases where it might try to read to get the header line. 539 * 540 * Note: For Windows we need to ensure there are no references held to the dummy 541 * file (somefile.txt) by the time rmdirRecurse tries to remove it. So we take 542 * a step not necessary in normal code and explicitly empty the inputSources in 543 * TsvSplitOptions structs that are created during the tests. In normal code, 544 * this happens when the input sources are iterated, but the sources are not 545 * iterated in these tests. 
     */
    auto testDir = makeUnittestTempDir("tsv_split_bylinecount");
    scope(exit) testDir.rmdirRecurse;

    string somefile_txt = buildPath(testDir, "somefile.txt");
    string anotherfile_pqr = buildPath(testDir, "anotherfile.pqr");

    /* Create the two dummy input files used by the tests below. */
    {
        auto f1 = somefile_txt.File("wb");
        f1.writeln("Hello World!");
        f1.close;

        auto f2 = anotherfile_pqr.File("wb");
        f2.writeln("Good Morning World!");
        f2.close;
    }

    /* Basic mode selection: --lines-per-file. */
    {
        auto args = ["unittest", "--lines-per-file", "10", somefile_txt];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.linesPerFile == 10);
        assert(cmdopt.keyFields.empty);
        assert(cmdopt.numFiles == 0);
        assert(cmdopt.hasHeader == false);

        while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
    }
    /* Basic mode selection: --num-files. */
    {
        auto args = ["unittest", "--num-files", "20", somefile_txt];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.linesPerFile == 0);
        assert(cmdopt.keyFields.empty);
        assert(cmdopt.numFiles == 20);
        assert(cmdopt.hasHeader == false);

        while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
    }
    /* Key fields: a field list is converted to zero-based indices. */
    {
        auto args = ["unittest", "-n", "5", "--key-fields", "1-3", somefile_txt];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.linesPerFile == 0);
        assert(cmdopt.keyFields == [0, 1, 2]);
        assert(cmdopt.numFiles == 5);
        assert(cmdopt.hasHeader == false);
        assert(cmdopt.keyIsFullLine == false);

        while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
    }
    /* Key fields: '-k 0' selects the whole line as the key. */
    {
        auto args = ["unittest", "-n", "5", "-k", "0", somefile_txt];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.linesPerFile == 0);
        assert(cmdopt.numFiles == 5);
        assert(cmdopt.hasHeader == false);
        assert(cmdopt.keyIsFullLine == true);

        while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
    }
    /* Header handling: --header writes headers to output files. */
    {
        auto args = ["unittest", "-n", "2", "--header", somefile_txt];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.headerInOut == true);
        assert(cmdopt.hasHeader == true);
        assert(cmdopt.headerIn == false);

        while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
    }
    /* Header handling: --header-in-only drops headers from output files. */
    {
        auto args = ["unittest", "-n", "2", "--header-in-only", somefile_txt];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.headerInOut == false);
        assert(cmdopt.hasHeader == true);
        assert(cmdopt.headerIn == true);

        while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
    }

    /* Helper: run processArgs and check the derived filename suffix. */
    static void testSuffix(string[] args, string expectedSuffix)
    {
        TsvSplitOptions cmdopt;
        auto savedArgs = args.to!string;
        const r = cmdopt.processArgs(args);

        assert(r[0], format("[testSuffix] cmdopt.processArgs(%s) returned false.", savedArgs));
        assert(cmdopt.suffix == expectedSuffix,
               format("[testSuffix] Incorrect cmdopt.suffix. Expected: '%s', Actual: '%s'\n   cmdopt.processArgs(%s)",
                      expectedSuffix, cmdopt.suffix, savedArgs));

        while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
    }

    /* In these tests, don't use headers and when files are listed, use 'somefile_txt' first.
     * This makes sure there is no attempt to read standard input and that there won't be an
     * open failure trying to find a file.
     */
    testSuffix(["unittest", "-n", "2"], "");
    testSuffix(["unittest", "-n", "2", "--", "-"], "");
    testSuffix(["unittest", "-n", "2", "--suffix", "_123"], "_123");
    testSuffix(["unittest", "-n", "2", somefile_txt], ".txt");
    testSuffix(["unittest", "-n", "2", somefile_txt, anotherfile_pqr], ".txt");
    testSuffix(["unittest", "-n", "2", "--suffix", ".X", somefile_txt, anotherfile_pqr], ".X");
    testSuffix(["unittest", "-n", "2", "--suffix", "", somefile_txt], "");
    testSuffix(["unittest", "-n", "2", "--", "-", somefile_txt], "");
    testSuffix(["unittest", "-n", "2", "--", somefile_txt, "-"], ".txt");

    /* Helper: run processArgs and check the derived filename digit width. */
    static void testDigitWidth(string[] args, uint expected)
    {
        TsvSplitOptions cmdopt;
        auto savedArgs = args.to!string;
        const r = cmdopt.processArgs(args);

        assert(r[0], format("[testDigitWidth] cmdopt.processArgs(%s) returned false.", savedArgs));
        assert(cmdopt.digitWidth == expected,
               format("[testDigitWidth] Incorrect cmdopt.digitWidth. Expected: %d, Actual: %d\n   cmdopt.processArgs(%s)",
                      expected, cmdopt.digitWidth, savedArgs));

        while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
    }

    testDigitWidth(["unittest", "-n", "2", somefile_txt], 1);
    testDigitWidth(["unittest", "-n", "2", "--digit-width" , "0", somefile_txt], 1);
    testDigitWidth(["unittest", "-n", "10", somefile_txt], 1);
    testDigitWidth(["unittest", "-n", "11", somefile_txt], 2);
    testDigitWidth(["unittest", "-n", "555", somefile_txt], 3);
    testDigitWidth(["unittest", "-n", "555", "--digit-width" , "2", somefile_txt], 2);
    testDigitWidth(["unittest", "-n", "555", "--digit-width" , "4", somefile_txt], 4);
    testDigitWidth(["unittest", "-l", "10", somefile_txt], 3);
    testDigitWidth(["unittest", "-l", "10000", somefile_txt], 3);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "0", somefile_txt], 3);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "1", somefile_txt], 1);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "5", somefile_txt], 5);
}

/** Get the rlimit current number of open files the process is allowed.
 *
 * This routine returns the current soft limit on the number of open files the process
 * is allowed. This is the number returned by the command: '$ ulimit -n'.
 *
 * This routine translates this value to a 'uint', as tsv-split uses 'uint' for
 * tracking output files. The rlimit 'rlim_t' type is usually 'ulong' or 'long'.
 * RLIM_INFINITY and any value larger than 'uint.max' is translated to 'uint.max'.
 *
 * An exception is thrown if call to 'getrlimit' fails.
 *
 * Note about Windows: rlimit is a Posix construct, not available on Windows.
 * Currently, tsv-split is written for Posix. To allow it compile on Windows, this
 * routine returns 512 on Windows, which is the default for Windows stream I/O. This
 * is a stop-gap solution.
 * A more generalized 'systemCurrOpenFilesLimit' would make
 * sense if Windows becomes primary platform. That would also require changing error
 * messages, help, etc., to be platform specific. At present, testing is done only on
 * Posix platforms. For info on Windows stream I/O limits see:
 * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/setmaxstdio
 */
uint rlimitCurrOpenFilesLimit()
{
    version (Posix)
    {
        import core.sys.posix.sys.resource :
            rlim_t, rlimit, getrlimit, RLIMIT_NOFILE, RLIM_INFINITY, RLIM_SAVED_CUR;
        import std.conv : to;

        uint currOpenFileLimit = uint.max;

        rlimit rlimitMaxOpenFiles;

        enforce(getrlimit(RLIMIT_NOFILE, &rlimitMaxOpenFiles) == 0,
                "Internal error: getrlimit call failed");

        /* Special rlim_cur values (infinity, saved-but-unrepresentable) and anything
         * beyond uint range all map to uint.max. */
        if (rlimitMaxOpenFiles.rlim_cur != RLIM_INFINITY &&
            rlimitMaxOpenFiles.rlim_cur != RLIM_SAVED_CUR &&
            rlimitMaxOpenFiles.rlim_cur >= 0 &&
            rlimitMaxOpenFiles.rlim_cur <= uint.max)
        {
            currOpenFileLimit = rlimitMaxOpenFiles.rlim_cur.to!uint;
        }

        return currOpenFileLimit;
    }
    else version (Windows)
    {
        return 512;
    }
    else
    {
        static assert(0, "Unsupported platform.");
    }
}

/** Invokes the proper split routine based on the command line arguments.
 *
 * This routine is the top-level control after command line argument processing is
 * done. Its primary job is to set up data structures and invoke the correct
 * processing routine based on the command line arguments.
 */
void tsvSplit(ref TsvSplitOptions cmdopt)
{
    /* Check that the input files were setup as expected. Should at least have one
     * input, stdin if nothing else. */
    assert(!cmdopt.inputSources.empty);

    if (cmdopt.linesPerFile != 0)
    {
        splitByLineCount(cmdopt);
    }
    else
    {
        /* Randomly distribute input lines to a specified number of files. */

        auto outputFiles =
            SplitOutputFiles(cmdopt.numFiles, cmdopt.dir, cmdopt.prefix, cmdopt.suffix,
                             cmdopt.digitWidth, cmdopt.headerInOut, cmdopt.maxOpenOutputFiles,
                             cmdopt.inputSources.front.header);

        if (!cmdopt.appendToExistingFiles)
        {
            string existingFile = outputFiles.checkIfFilesExist;
            enforce(existingFile.length == 0,
                    format("One or more output files already exist. Use '--a|append' to append to existing files. File: '%s'.",
                           existingFile));
        }

        if (cmdopt.keyFields.length == 0)
        {
            splitLinesRandomly(cmdopt, outputFiles);
        }
        else
        {
            splitLinesByKey(cmdopt, outputFiles);
        }
    }
}

/** A SplitOutputFiles struct holds a collection of output files.
 *
 * This struct manages a collection of output files used when writing to multiple
 * files at once. This includes constructing filenames, opening and closing files,
 * and writing data and header lines.
 *
 * Both random assignment (splitLinesRandomly) and random assignment by key
 * (splitLinesByKey) use a SplitOutputFiles struct to manage output files.
 *
 * The main properties of the output file set are specified in the constructor. The
 * exception is the header line. This is not known until the first input file is
 * read, so it is specified in a separate 'setHeader' call.
 *
 * Individual output files are written to based on their zero-based index in the
 * output collection. The caller selects the output file number to write to and
 * calls 'writeDataLine' to write a line. The header is written if needed.
 */
struct SplitOutputFiles
{
    import std.conv : to;
    import std.file : exists;
    import std.path : buildPath;
    import std.stdio : File;

    /* Per-file bookkeeping. 'isOpen' duplicates File's own open state because of
     * the Phobos issue noted below; 'hasData' gates header emission.
     */
    static struct OutputFile
    {
        string filename;
        File ofile;
        bool hasData;
        bool isOpen;    // Track separately due to https://github.com/dlang/phobos/pull/7397
    }

    private uint _numFiles;        // Total output files in the set (>= 2).
    private bool _writeHeaders;    // True if a header line is written to each file.
    private uint _maxOpenFiles;    // Cap on simultaneously open file handles.

    private OutputFile[] _outputFiles;
    private uint _numOpenFiles = 0;    // Invariant: count of entries with isOpen == true.
    private string _header;            // Header line, without trailing newline.

    /* Constructor: records the file-set properties and assigns every output
     * filename up front. Files are not opened until first written to.
     */
    this(uint numFiles, string dir, string filePrefix, string fileSuffix,
         uint fileDigitWidth, bool writeHeaders, uint maxOpenFiles, string header)
    {
        assert(numFiles >= 2);
        assert(maxOpenFiles >= 1);

        _numFiles = numFiles;
        _writeHeaders = writeHeaders;
        _maxOpenFiles = maxOpenFiles;
        _header = header;

        _outputFiles.length = numFiles;

        /* Filename assignment: '<dir>/<prefix>NNN<suffix>' with zero-padded,
         * fileDigitWidth-wide file numbers.
         */
        foreach (i, ref f; _outputFiles)
        {
            f.filename =
                buildPath(dir, format("%s%.*d%s", filePrefix, fileDigitWidth, i, fileSuffix));
        }
    }

    /* Destructor ensures all files are closed.
     *
     * Note: A dual check on whether the file is open is made. This is to avoid a
     * Phobos bug where std.stdio.File doesn't properly maintain the state of open
     * files if the File.open call fails. See:
     * https://github.com/dlang/phobos/pull/7397.
     */
    ~this()
    {
        foreach (ref f; _outputFiles)
        {
            if (f.isOpen && f.ofile.isOpen)
            {
                assert(_numOpenFiles >= 1);

                f.ofile.close;
                f.isOpen = false;
                _numOpenFiles--;
            }
        }
    }

    /* Check if any of the files already exist.
     *
     * Returns the empty string if none of the files exist. Otherwise returns the
     * filename of the first existing file found. This is to facilitate error
     * message generation.
     */
    string checkIfFilesExist()
    {
        foreach (f; _outputFiles) if (f.filename.exists) return f.filename;
        return "";
    }

    /* Picks a random file to close. Used when the open file handle limit has been
     * reached.
     *
     * A random starting point is chosen and the file list is scanned cyclically
     * from there, closing the first open file found. Randomizing the start avoids
     * always evicting the same (low-numbered) files.
     */
    private void closeSomeFile()
    {
        import std.random : uniform;
        assert(_numOpenFiles > 0);

        immutable uint start = uniform(0, _numFiles);

        foreach (i; cycle(iota(_numFiles), start).take(_numFiles))
        {
            if (_outputFiles[i].isOpen)
            {
                _outputFiles[i].ofile.close;
                _outputFiles[i].isOpen = false;
                _numOpenFiles--;

                return;
            }
        }

        /* Unreachable when the _numOpenFiles invariant holds. */
        assert(false, "[SplitOutputFiles.closeSomeFile]: Could not find file to close.");
    }

    /* Write a line to the specified file number.
     *
     * A header is written to the file if headers are being written and this is the
     * first data written to the file. Files are opened lazily, in append mode, so
     * a file that was closed by closeSomeFile and later written again picks up
     * where it left off.
     */
    void writeDataLine(uint fileNum, const char[] data)
    {
        assert(fileNum < _numFiles);
        assert(fileNum < _outputFiles.length);
        assert(_numOpenFiles <= _maxOpenFiles);

        OutputFile* outputFile = &_outputFiles[fileNum];

        if (!outputFile.isOpen)
        {
            /* Respect the handle limit: evict one open file before opening another. */
            if (_numOpenFiles == _maxOpenFiles) closeSomeFile();
            assert(_numOpenFiles < _maxOpenFiles);

            outputFile.ofile = outputFile.filename.File("ab");
            outputFile.isOpen = true;
            _numOpenFiles++;

            /* First open of this file in this run: detect pre-existing content
             * (append mode) so a header is not written twice. ulong.max is
             * File.size's sentinel for an unknown size; treat it as "no data".
             */
            if (!outputFile.hasData)
            {
                ulong filesize = outputFile.ofile.size;
                outputFile.hasData = (filesize > 0 && filesize != ulong.max);
            }
        }

        if (_writeHeaders && !outputFile.hasData) outputFile.ofile.writeln(_header);

        outputFile.ofile.writeln(data);
        outputFile.hasData = true;
    }
}

/** Write input lines to multiple files, randomly selecting an output file for each line.
 */
void splitLinesRandomly(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles)
{
    import std.random : Random = Mt19937, uniform;
    import tsv_utils.common.utils : bufferedByLine, InputSourceRange;

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* Seeded Mersenne Twister: the same seed reproduces the same file assignment. */
    auto randomGenerator = Random(cmdopt.seed);

    /* Process each line. */
    foreach (inputStream; cmdopt.inputSources)
    {
        foreach (line; inputStream.file.bufferedByLine)
        {
            immutable uint outputFileNum = uniform(0, cmdopt.numFiles, randomGenerator);
            outputFiles.writeDataLine(outputFileNum, line);
        }
    }
}

/** Write input lines to multiple output files using fields as a random selection key.
 *
 * Each input line is written to an output file. The output file is chosen using
 * fields as a key. Each unique key is assigned to a file. All lines having the
 * same key are written to the same file.
 *
 * The assignment is a MurmurHash3 of the key fields (joined by the delimiter),
 * modulo the number of files. With the same seed this mapping is deterministic
 * across runs, so any restructuring of the hashing sequence would change which
 * file each key lands in.
 */
void splitLinesByKey(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles)
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import tsv_utils.common.utils : bufferedByLine, InputFieldReordering,
        InputSourceRange, throwIfWindowsNewline;

    assert(cmdopt.keyFields.length > 0);

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys.

    /* Create a mapping for the key fields. Null when the whole line is the key. */
    auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);

    /* Process each line. Line numbers are 1-based for error reporting; data lines
     * start at 2 when a header is present.
     */
    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);

        foreach (fileLineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine))
        {
            /* Only the first line of each file needs the Windows newline check;
             * the remainder of the file is assumed consistent with it.
             */
            if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum);

            /* Murmurhash works by successively adding individual keys, then finalizing.
             * Adding individual keys is simpler if the full-line-as-key and individual
             * fields as keys cases are separated.
             */
            auto hasher = MurmurHash3!32(cmdopt.seed);

            if (cmdopt.keyIsFullLine)
            {
                hasher.put(cast(ubyte[]) line);
            }
            else
            {
                assert(keyFieldsReordering !is null);

                /* Gather the key field values and assemble the key. Stop splitting
                 * as soon as all key fields have been seen.
                 */
                keyFieldsReordering.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    keyFieldsReordering.processNextField(fieldIndex, fieldValue);
                    if (keyFieldsReordering.allFieldsFilled) break;
                }

                enforce(keyFieldsReordering.allFieldsFilled,
                        format("Not enough fields in line. File: %s, Line: %s",
                               inputStream.name, fileLineNum));

                /* Hash the key fields with the delimiter between them, mirroring
                 * how the fields appear in the input line.
                 */
                foreach (count, key; keyFieldsReordering.outputFields.enumerate)
                {
                    if (count > 0) hasher.put(delimArray);
                    hasher.put(cast(ubyte[]) key);
                }
            }

            hasher.finish;
            immutable uint outputFileNum = hasher.get % cmdopt.numFiles;
            outputFiles.writeDataLine(outputFileNum, line);
        }
    }
}

/** Write input lines to multiple files, splitting based on line count.
 *
 * Note: readBufferSize is an argument primarily for unit test purposes. Normal uses
 * should use the default value.
 */
void splitByLineCount(ref TsvSplitOptions cmdopt, const size_t readBufferSize = 1024L * 128L)
{
    import std.file : exists;
    import std.path : buildPath;
    import std.stdio : File;
    import tsv_utils.common.utils : InputSourceRange;

    assert (readBufferSize > 0);
    ubyte[] readBuffer = new ubyte[readBufferSize];

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* Header is kept with its terminator so it can be rawWrite'n verbatim. */
    string header = !cmdopt.headerInOut ? "" :
        cmdopt.inputSources.front.header(Yes.keepTerminator);
    size_t nextOutputFileNum = 0;
    File outputFile;
    string outputFileName;
    bool isOutputFileOpen = false;   // Open file status tracked separately due to phobos bugs
    size_t outputFileRemainingLines; // Lines still to be written to the current output file.

    /* nextNewlineIndex finds the index of the next newline character. It is an
     * alternative to std.algorithm.countUntil. Invoking 'find' directly results in
     * 'memchr' being used (faster). The current 'countUntil' implementation does
     * forward to find, but the way it is done avoids the memchr call optimization.
     *
     * Returns -1 when the buffer contains no newline.
     */
    static long nextNewlineIndex(const ubyte[] buffer)
    {
        import std.algorithm : find;
        immutable ubyte newlineChar = '\n';
        immutable size_t buflen = buffer.length;
        immutable size_t findlen = buffer.find(newlineChar).length;

        return findlen > 0 ? buflen - findlen : -1;
    }

    /* Main loop: read fixed-size chunks and carve each chunk into output-file-sized
     * spans. 'nextOutputChunkStart'/'nextOutputChunkEnd' index into 'inputChunk';
     * 'remainingInputChunk' is always the tail of 'inputChunk' not yet assigned.
     */
    foreach (inputStream; cmdopt.inputSources)
    {
        foreach (ref ubyte[] inputChunk; inputStream.file.byChunk(readBuffer))
        {
            size_t nextOutputChunkStart = 0;
            auto remainingInputChunk = inputChunk[nextOutputChunkStart .. $];

            while (!remainingInputChunk.empty)
            {
                /* See if the next output file needs to be opened. */
                if (!isOutputFileOpen)
                {
                    outputFileName =
                        buildPath(cmdopt.dir,
                                  format("%s%.*d%s", cmdopt.prefix,
                                         cmdopt.digitWidth, nextOutputFileNum, cmdopt.suffix));

                    enforce(cmdopt.appendToExistingFiles || !outputFileName.exists,
                            format("Output file already exists. Use '--a|append' to append to existing files. File: '%s'.",
                                   outputFileName));

                    outputFile = outputFileName.File("ab");
                    outputFile.setvbuf(1024L * 64L, _IOFBF);
                    isOutputFileOpen = true;
                    ++nextOutputFileNum;
                    outputFileRemainingLines = cmdopt.linesPerFile;

                    /* Write the header only if the file is empty. ulong.max is
                     * File.size's sentinel for an unknown size.
                     */
                    if (cmdopt.headerInOut)
                    {
                        ulong filesize = outputFile.size;
                        if (filesize == 0 || filesize == ulong.max) outputFile.rawWrite(header);
                    }
                }

                /* Find more newlines for the current output file. */

                assert(outputFileRemainingLines > 0);

                size_t nextOutputChunkEnd = nextOutputChunkStart;

                while (outputFileRemainingLines != 0 && !remainingInputChunk.empty)
                {
                    /* Note: newlineIndex is relative to 'remainingInputChunk', not
                     * 'inputChunk'. Updates to variables referring to 'inputChunk'
                     * need to reflect this. In particular, 'nextOutputChunkEnd'.
                     */
                    immutable newlineIndex = nextNewlineIndex(remainingInputChunk);

                    if (newlineIndex == -1)
                    {
                        /* No newline in the rest of the chunk: the whole remainder
                         * belongs to the current output file; the line continues in
                         * the next chunk, so the line count is not decremented.
                         */
                        nextOutputChunkEnd = inputChunk.length;
                    }
                    else
                    {
                        --outputFileRemainingLines;
                        nextOutputChunkEnd += (newlineIndex + 1);
                    }

                    remainingInputChunk = inputChunk[nextOutputChunkEnd .. $];
                }

                assert(nextOutputChunkStart < nextOutputChunkEnd);
                assert(nextOutputChunkEnd <= inputChunk.length);

                outputFile.rawWrite(inputChunk[nextOutputChunkStart .. nextOutputChunkEnd]);

                if (outputFileRemainingLines == 0)
                {
                    outputFile.close;
                    isOutputFileOpen = false;
                }

                nextOutputChunkStart = nextOutputChunkEnd;

                assert(remainingInputChunk.length == inputChunk.length - nextOutputChunkStart);
            }
        }
    }
}

/* splitByLineCount unit tests.
 *
 * These tests are primarily for buffer management. There are edge cases involving the
 * interaction of buffer size, input file size, lines-per-file, and newline placement
 * that are difficult to test against the executable.
 */
unittest
{
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.algorithm : min;
    import std.array : appender;
    import std.conv : to;
    import std.file : exists, mkdir, rmdirRecurse;
    import std.path : buildPath;
    import std.process : escapeShellCommand, executeShell;

    /* Test setup
     *
     * A set of input files is created, with names: input_NxM.txt, where
     * N is the number of characters in each row and M is the number of rows (lines).
     * The resulting files are put in the "lc_input" directory ('inputDir' variable)
     * and have names:
     *     input_0x2.txt, input_0x3.txt, ... input_5x5.txt.
     *
     * A standalone block of code produces the expected result files for splitting an
     * input file into a set of output files. This duplicates the splitByLineCount
     * output. This is done for lines-per-file counts 1 to 5. Each result set is placed
     * in a subdirectory under "lc_expected" ('expectedDir' variable). Subdirectories
     * have names like: "0x2_by_1", "0x3_by_1", ..., "5x5_by_4".
     *
     * splitByLineCount is called for all the same input files and lines-per-file
     * settings used to produce the expected output. This is done via
     * testSplitByLineCount, which calls command line argument processing and
     * splitByLineCount, similar to how the main program works. The results are
     * written to a subdirectory. The subdirectory is compared to the expected output
     * directory using the system 'diff' command.
     *
     * splitByLineCount is called multiple times for each expected output case. The
     * different calls iterate over a series of small readBufferSizes. This is how
     * testing for edge cases in the readBufferSize vs line lengths, newline
     * placement, etc., is accomplished.
     *
     * Note: One way to understand what is going on is to comment out the line:
     *
     *    scope(exit) testDir.rmdirRecurse;
     *
     * Then run the test (e.g. 'make test') and look at the directory structure left
     * behind. Print out the 'testDir' directory to see where it is located.
     */

    /* testSplitByLineCount acts as a surrogate for main() and tsvSplit(). It makes the
     * call to splitByLineCount and calls 'diff' to compare the output directory to the
     * expected directory. An assert is thrown if the directories do not match.
     */
    static void testSplitByLineCount(string[] cmdArgs, string expectedDir,
                                     size_t readBufferSize = 1024L * 512L)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSplitByLineCount] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSplitByLineCount] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSplitOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
        assert(cmdopt.linesPerFile != 0, "[testSplitByLineCount] --lines-per-file is required.");
        assert(!cmdopt.dir.empty, "[testSplitByLineCount] --dir is required.");

        splitByLineCount(cmdopt, readBufferSize);

        /* Diff command setup. */
        auto diffCmdArgs = ["diff", expectedDir, cmdopt.dir];
        auto diffResult = executeShell(escapeShellCommand(diffCmdArgs));
        assert(diffResult.status == 0,
               format("[testSplitByLineCount]\n cmd: %s\n readBufferSize: %d\n expectedDir: %s\n------ Diff ------%s\n-------",
                      savedCmdArgs, readBufferSize, expectedDir, diffResult.output));
    }

    auto testDir = makeUnittestTempDir("tsv_split_bylinecount");
    scope(exit) testDir.rmdirRecurse;

    auto inputDir = buildPath(testDir, "lc_input");
    auto outputDir = buildPath(testDir, "lc_output");
    auto expectedDir = buildPath(testDir, "lc_expected");

    mkdir(inputDir);
    mkdir(outputDir);
    mkdir(expectedDir);

    static string buildInputFilePath(string dir, long inputLineLength, long inputFileNumLines)
    {
        return buildPath(dir, format("input_%dx%d.txt", inputLineLength, inputFileNumLines));
    }

    string[5] outputRowData =
        [
            "abcde",
            "fghij",
            "klmno",
            "pqrst",
            "uvwxy"
        ];

    /* The main test loop. Iterates over input line lengths, numbers of rows,
     * lines-per-file, and finally readBufferSize lengths. All combos are tested.
     */
    foreach (inputLineLength; 0 .. 6)
    {
        foreach (inputFileNumLines; 2 .. 6)
        {
            auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);

            /* Write the input file: inputFileNumLines rows, each a prefix of the
             * corresponding outputRowData row, newline-terminated.
             */
            {
                auto ofile = inputFile.File("wb");
                auto output = appender!(char[])();
                foreach (m; 0 .. inputFileNumLines)
                {
                    put(output, outputRowData[m][0 .. inputLineLength]);
                    put(output, '\n');
                }
                ofile.write(output.data);
                ofile.close;
            }

            /* Iterate over the different lines-per-file lengths.
             * - Create an expected output directory and files for each.
             * - Test with different readBufferSize values.
             */
            foreach (outputFileNumLines; 1 .. min(5, inputFileNumLines))
            {
                auto expectedSubDir =
                    buildPath(expectedDir, format("%dx%d_by_%d", inputLineLength,
                                                  inputFileNumLines, outputFileNumLines));
                mkdir(expectedSubDir);

                size_t filenum = 0;
                size_t linesWritten = 0;
                while (linesWritten < inputFileNumLines)
                {
                    auto expectedFile = buildPath(expectedSubDir, format("part_%d.txt", filenum));
                    auto f = expectedFile.File("wb");
                    auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);
                    foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
                    {
                        f.writeln(line[0 .. inputLineLength]);
                    }
                    linesWritten += linesToWrite;
                    ++filenum;
                    f.close;
                }

                /* Test the different readBufferSizes.
                 * - An output directory is created for the run and deleted afterward.
                 * - First test the default size.
                 * - Then iterate over small readBufferSize values.
                 */
                auto outputSubDir =
                    buildPath(outputDir, format("%dx%d_by_%d", inputLineLength,
                                                inputFileNumLines, outputFileNumLines));
                mkdir(outputSubDir);

                testSplitByLineCount(
                    ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                     "--digit-width", "1", inputFile],
                    expectedSubDir);

                outputSubDir.rmdirRecurse;

                foreach (readBufSize; 1 .. 8)
                {
                    mkdir(outputSubDir);

                    testSplitByLineCount(
                        ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                         "--digit-width", "1", inputFile],
                        expectedSubDir, readBufSize);

                    outputSubDir.rmdirRecurse;
                }
            }
        }
    }

    {
        /* Tests for the special case where readBufferSize is smaller than the header
         * line. We'll reuse the input_5x4.txt input file and write 1 line-per-file.
         */
        immutable inputLineLength = 5;
        immutable inputFileNumLines = 4;
        immutable outputFileNumLines = 1;

        auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);
        assert(inputFile.exists);

        auto expectedSubDirHeader =
            buildPath(expectedDir, format("%dx%d_by_%d_header", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        auto expectedSubDirHeaderInOnly =
            buildPath(expectedDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        mkdir(expectedSubDirHeader);
        mkdir(expectedSubDirHeaderInOnly);

        /* Generate the expected results. Cheat by starting with linesWritten = 1. This
         * automatically excludes the header line, but keeps the loop code consistent
         * with the main test loop.
         */
        size_t filenum = 0;
        size_t linesWritten = 1;
        while (linesWritten < inputFileNumLines)
        {
            auto expectedFileHeader = buildPath(expectedSubDirHeader, format("part_%d.txt", filenum));
            auto expectedFileHeaderInOnly = buildPath(expectedSubDirHeaderInOnly,
                                                      format("part_%d.txt", filenum));
            auto fHeader = expectedFileHeader.File("wb");
            auto fHeaderInOnly = expectedFileHeaderInOnly.File("wb");
            auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);

            /* '--header' output repeats the header in every file; '--header-in-only'
             * consumes the input header but writes none.
             */
            fHeader.writeln(outputRowData[0][0 .. inputLineLength]);
            foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
            {
                fHeader.writeln(line[0 .. inputLineLength]);
                fHeaderInOnly.writeln(line[0 .. inputLineLength]);
            }
            linesWritten += linesToWrite;
            ++filenum;
            fHeader.close;
            fHeaderInOnly.close;
        }

        /* Now run the tests. */
        auto outputSubDirHeader =
            buildPath(outputDir, format("%dx%d_by_%d_header", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));
        auto outputSubDirHeaderInOnly =
            buildPath(outputDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));

        foreach (readBufSize; 1 .. 6)
        {
            mkdir(outputSubDirHeader);
            mkdir(outputSubDirHeaderInOnly);

            testSplitByLineCount(
                ["test", "--header", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeader, "--digit-width", "1", inputFile],
                expectedSubDirHeader, readBufSize);

            testSplitByLineCount(
                ["test", "--header-in-only", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeaderInOnly, "--digit-width", "1", inputFile],
                expectedSubDirHeaderInOnly, readBufSize);

            outputSubDirHeader.rmdirRecurse;
            outputSubDirHeaderInOnly.rmdirRecurse;
        }
    }
}