/**
Command line tool for splitting a file (or files) into multiple output files.
Several methods for splitting are available, including splitting by line count,
splitting by random assignment, and splitting by random assignment based on
key fields.

Copyright (c) 2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_split;

import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

/* Disable GC cleanup at program exit; the OS reclaims memory faster. Only
 * available on compiler front-ends 2.085 and later.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program.
     *
     * Invokes command line argument processing and calls tsvSplit to do the real
     * work. Errors occurring during processing are caught and reported to the user.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try
        {
            tsvSplit(cmdopt);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

immutable helpText = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

By default, files are written to the current directory and have names
of the form 'part_NNN<suffix>', with 'NNN' being a number and <suffix>
being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The output
directory and file names are customizable.

Use '--help-verbose' for more detailed information.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

Output files: By default, files are written to the current directory and
have names of the form 'part_NNN<suffix>', with 'NNN' being a number and
<suffix> being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The suffix is
empty when reading from standard input. The numeric part defaults to 3
digits for '--l|lines-per-files'. For '--n|num-files' enough digits are
used so all filenames are the same length. The output directory and file
names are customizable.

Header lines: There are two ways to handle input with headers: write a
header to all output files (--H|header), or exclude headers from all
output files ('--I|header-in-only'). The best choice depends on the
follow-up processing. All tsv-utils tools support header lines in multiple
input files, but many other tools do not. For example, GNU parallel works
best on files without header lines.

Random assignment (--n|num-files): Random distribution of records to a set
of files is a common task. When data fits in memory the preferred approach
is usually to shuffle the data and split it into fixed sized blocks. E.g.
'tsv-sample data.tsv | tsv-split -l NUM'. However, alternate approaches
are needed when data is too large for convenient shuffling. tsv-split's
random assignment feature is useful in this case. Each input line is
written to a randomly selected output file. Note that output files will
have similar but not identical numbers of records.

Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): This
splits a data set into multiple files sharded by key. All lines with the
same key are written to the same file. This partitioning enables parallel
computation based on the key. For example, statistical calculation
('tsv-summarize --group-by') or duplicate removal ('tsv-uniq --fields').
These operations can be parallelized using tools like GNU parallel, which
simplifies concurrent operations on multiple files.

Random seed: By default, each tsv-split invocation using random assignment
or random assignment by key produces different assignments to the output
files. Using '--s|static-seed' changes this so multiple runs produce the
same assignments. This works by using the same random seed each run. The
seed can be specified using '--v|seed-value'.

Appending to existing files: By default, an error is triggered if an
output file already exists. '--a|append' changes this so that lines are
appended to existing files. (Header lines are not appended to files with
data.) This is useful when adding new data to files created by a previous
tsv-split run. Random assignment should use the same '--n|num-files' value
each run, but different random seeds (avoid '--s|static-seed'). Random
assignment by key should use the same '--n|num-files', '--k|key-fields',
and seed ('--s|static-seed' or '--v|seed-value') each run.

Max number of open files: Random assignment and random assignment by key
are dramatically faster when all output files are kept open. However,
keeping a large numbers of open files can bump into system limits or limit
resources available to other processes. By default, tsv-split uses up to
4096 open files or the system per-process limit, whichever is smaller.
This can be changed using '--max-open-files', though it cannot be set
larger than the system limit. The system limit varies considerably between
systems. On many systems it is unlimited. On MacOS it is often set to 256.
Use Unix 'ulimit' to display and modify the limits:
* 'ulimit -n' - Show the "soft limit". The per-process maximum.
* 'ulimit -Hn' - Show the "hard limit". The max allowed soft limit.
* 'ulimit -Sn NUM' - Change the "soft limit" to NUM.

Examples:

# Split a 10 million line file into 1000 files, 10,000 lines each.
# Output files are part_000.txt, part_001.txt, ... part_999.txt.
tsv-split data.txt --lines-per-file 10000

# Same as the previous example, but write files to a subdirectory.
tsv-split data.txt --dir split_files --lines-per-file 10000

# Split a file into 10,000 line files, writing a header line to each
tsv-split data.txt -H --lines-per-file 10000

# Same as the previous example, but dropping the header line.
tsv-split data.txt -I --lines-per-file 10000

# Randomly assign lines to 1000 files
tsv-split data.txt --num-files 1000

# Randomly assign lines to 1000 files while keeping unique keys from
# field 3 together.
tsv-split data.tsv --num-files 1000 -k 3

# Randomly assign lines to 1000 files. Later, randomly assign lines
# from a second data file to the same output files.
tsv-split data1.tsv -n 1000
tsv-split data2.tsv -n 1000 --append

# Randomly assign lines to 1000 files using field 3 as a key.
# Later, add a second file to the same output files.
tsv-split data1.tsv -n 1000 -k 3 --static-seed
tsv-split data2.tsv -n 1000 -k 3 --static-seed --append

# Change the system per-process open file limit for one command.
# The parens create a sub-shell. The current shell is not changed.
( ulimit -Sn 1000 && tsv-split --num-files 1000 data.txt )

Options:
EOS";

/** Container for command line options and derived data.
 *
 * TsvSplitOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, TsvSplitOptions is used as a container
 * holding the specific processing options used by the splitting algorithms.
215 */ 216 struct TsvSplitOptions 217 { 218 enum invalidFileSuffix = "///////"; 219 220 string programName; /// Program name 221 string[] files; /// Input files 222 bool helpVerbose = false; /// --help-verbose 223 bool headerInOut = false; /// --H|header 224 bool headerIn = false; /// --I|header-in-only 225 size_t linesPerFile = 0; /// --l|lines-per-file 226 uint numFiles = 0; /// --n|num-files 227 size_t[] keyFields; /// --k|key-fields 228 string dir; /// --dir 229 string prefix = "part_"; /// --prefix 230 string suffix = invalidFileSuffix; /// --suffix 231 uint digitWidth = 0; /// --w|digit-width 232 bool appendToExistingFiles = false; /// --a|append 233 bool staticSeed = false; /// --s|static-seed 234 uint seedValueOptionArg = 0; /// --v|seed-value 235 char delim = '\t'; /// --d|delimiter 236 uint maxOpenFilesArg = 0; /// --max-open-files 237 bool versionWanted = false; /// --V|version 238 bool hasHeader = false; /// Derived. True if either '--H|header' or '--I|header-in-only' is set. 239 bool keyIsFullLine = false; /// Derived. True if '--f|fields 0' is specfied. 240 bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value 241 uint seed = 0; /// Derived from --static-seed, --seed-value 242 uint maxOpenOutputFiles; /// Derived. 243 244 /** Process tsv-split command line arguments. 245 * 246 * Defines the command line options, performs validation, and derives additional 247 * state. std.getopt.getopt is called to do the main option processing followed 248 * additional validation and derivation. 249 * 250 * Help text is printed to standard output if help was requested. Error text is 251 * written to stderr if invalid input is encountered. 252 * 253 * A tuple is returned. First value is true if command line arguments were 254 * successfully processed and execution should continue, or false if an error 255 * occurred or the user asked for help. If false, the second value is the 256 * appropriate exit code (0 or 1). 
257 * 258 * Returning true (execution continues) means args have been validated and derived 259 * values calculated. Field indices will have been converted to zero-based. 260 */ 261 auto processArgs(ref string[] cmdArgs) 262 { 263 import std.algorithm : any, canFind, each, min; 264 import std.file : exists, isDir; 265 import std.format : format; 266 import std.getopt; 267 import std.math : isNaN; 268 import std.path : baseName, expandTilde, extension, stripExtension; 269 import std.typecons : Yes, No; 270 import tsv_utils.common.utils : makeFieldListOptionHandler; 271 272 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 273 274 try 275 { 276 arraySep = ","; // Use comma to separate values in command line options 277 auto r = getopt( 278 cmdArgs, 279 "help-verbose", " Print more detailed help.", &helpVerbose, 280 281 std.getopt.config.caseSensitive, 282 "H|header", " Input files have a header line. Write the header to each output file.", &headerInOut, 283 "I|header-in-only", " Input files have a header line. Do not write the header to output files.", &headerIn, 284 std.getopt.config.caseInsensitive, 285 286 "l|lines-per-file", "NUM Number of lines to write to each output file (excluding the header line).", &linesPerFile, 287 "n|num-files", "NUM Number of output files to generate.", &numFiles, 288 "k|key-fields", "<field-list> Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.", 289 keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero), 290 291 "dir", "STR Directory to write to. Default: Current working directory.", &dir, 292 "prefix", "STR Filename prefix. Default: 'part_'", &prefix, 293 "suffix", "STR Filename suffix. Default: First input file extension. None for standard input.", &suffix, 294 "w|digit-width", "NUM Number of digits in filename numeric portion. 
Default: '--l|lines-per-file': 3. '--n|num-files': Chosen so filenames have the same length. '--w|digit-width 0' uses the default.", &digitWidth, 295 "a|append", " Append to existing files.", &appendToExistingFiles, 296 297 "s|static-seed", " Use the same random seed every run.", &staticSeed, 298 299 std.getopt.config.caseSensitive, 300 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, 301 std.getopt.config.caseInsensitive, 302 303 "d|delimiter", "CHR Field delimiter.", &delim, 304 "max-open-files", "NUM Maximum open file handles to use. Min of 5 required.", &maxOpenFilesArg, 305 306 std.getopt.config.caseSensitive, 307 "V|version", " Print version information and exit.", &versionWanted, 308 std.getopt.config.caseInsensitive, 309 ); 310 311 if (r.helpWanted) 312 { 313 defaultGetoptPrinter(helpText, r.options); 314 return tuple(false, 0); 315 } 316 else if (helpVerbose) 317 { 318 defaultGetoptPrinter(helpTextVerbose, r.options); 319 return tuple(false, 0); 320 } 321 else if (versionWanted) 322 { 323 import tsv_utils.common.tsvutils_version; 324 writeln(tsvutilsVersionNotice("tsv-split")); 325 return tuple(false, 0); 326 } 327 328 /* 329 * Validation and derivations. 
330 */ 331 332 if (linesPerFile == 0 && numFiles == 0) 333 { 334 throw new Exception ("Either '--l|lines-per-file' or '--n|num-files' is required."); 335 } 336 337 if (linesPerFile != 0 && numFiles != 0) 338 { 339 throw new Exception ("'--l|lines-per-file' and '--n|num-files' cannot be used together."); 340 } 341 342 if (linesPerFile != 0 && keyFields.length != 0) 343 { 344 throw new Exception ("'--l|lines-per-file' and '--k|key-fields' cannot be used together."); 345 } 346 347 if (numFiles == 1) 348 { 349 throw new Exception("'--n|num-files must be two or more."); 350 } 351 352 if (keyFields.length > 0) 353 { 354 if (keyFields.length == 1 && keyFields[0] == 0) 355 { 356 keyIsFullLine = true; 357 } 358 else 359 { 360 if (keyFields.length > 1 && keyFields.any!(x => x == 0)) 361 { 362 throw new Exception( 363 "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); 364 } 365 366 keyFields.each!((ref x) => --x); // Convert to zero-based indexing. 367 } 368 } 369 370 if (headerInOut && headerIn) 371 { 372 throw new Exception("Use only one of '--H|header' and '--I|header-in-only'."); 373 } 374 375 hasHeader = headerInOut || headerIn; 376 377 if (!dir.empty) 378 { 379 dir = dir.expandTilde; 380 if (!dir.exists) throw new Exception(format("Directory does not exist: --dir '%s'", dir)); 381 else if (!dir.isDir) throw new Exception(format("Path is not a directory: --dir '%s'", dir)); 382 } 383 384 /* Seed. */ 385 import std.random : unpredictableSeed; 386 387 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 388 389 if (usingUnpredictableSeed) seed = unpredictableSeed; 390 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 391 else if (staticSeed) seed = 2438424139; 392 else assert(0, "Internal error, invalid seed option states."); 393 394 /* Maximum number of open files. Mainly applies when --num-files is used. 395 * 396 * Derive maxOpenOutputFiles. Inputs: 397 * - Internal default limit: 4096. 
This is a somewhat conservative setting. 398 * - rlimit open files limit. Defined by '$ ulimit -n'. 399 * - '--max-open-files' (maxOpenFilesArg). This adjusts the internal limit, 400 * but only up to the rlimit value. 401 * - Four open files are reserved for stdin, stdout, stderr, and one input 402 * file. 403 */ 404 405 immutable uint internalDefaultMaxOpenFiles = 4096; 406 immutable uint numReservedOpenFiles = 4; 407 immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit(); 408 409 if (maxOpenFilesArg != 0 && maxOpenFilesArg <= numReservedOpenFiles) 410 { 411 throw new Exception( 412 format("'--max-open-files' must be at least %d.", 413 numReservedOpenFiles + 1)); 414 } 415 416 if (maxOpenFilesArg > rlimitOpenFilesLimit) 417 { 418 throw new Exception( 419 format("'--max-open-files' value (%d) greater current system limit (%d)." ~ 420 "\nRun 'ulimit -n' to see the soft limit." ~ 421 "\nRun 'ulimit -Hn' to see the hard limit." ~ 422 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 423 maxOpenFilesArg, rlimitOpenFilesLimit)); 424 } 425 426 if (rlimitOpenFilesLimit <= numReservedOpenFiles) 427 { 428 throw new Exception( 429 format("System open file limit too small. Current value: %d. Must be %d or more." ~ 430 "\nRun 'ulimit -n' to see the soft limit." ~ 431 "\nRun 'ulimit -Hn' to see the hard limit." ~ 432 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 433 rlimitOpenFilesLimit, numReservedOpenFiles + 1)); 434 } 435 436 immutable uint openFilesLimit = 437 (maxOpenFilesArg != 0) 438 ? maxOpenFilesArg 439 : min(internalDefaultMaxOpenFiles, rlimitOpenFilesLimit); 440 441 assert(openFilesLimit > numReservedOpenFiles); 442 443 maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles; 444 445 /* Remaining command line args. 446 * 447 * Assume remaining args are files. Use standard input if files were not 448 * provided. 449 */ 450 451 files ~= (cmdArgs.length > 1) ? cmdArgs[1 .. 
$] : ["-"]; 452 cmdArgs.length = 1; 453 454 /* Suffix - If not provided, use the extension of the first input file. 455 * No suffix if reading from standard input. 456 */ 457 if (suffix == invalidFileSuffix) suffix = files[0].extension; 458 459 /* Ensure forward slash is not included in the filename prefix and suffix. 460 * Forward slash is an invalid Unix filename character. However, open file 461 * calls could match a directory path, resulting in unintended file 462 * creation. 463 * 464 * The other invalid filename character on Unix is the NULL character. 465 * However, the NULL character cannot be entered via Unix command lines, 466 * so there is no need to test for it explicitly. 467 */ 468 if (prefix.canFind('/')) 469 { 470 throw new Exception("'--prefix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 471 } 472 if (suffix.canFind('/')) 473 { 474 throw new Exception("'--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 475 } 476 477 /* Digit width - If not specified, or specified as zero, the width is 478 * determined by the number of files for --num-files, or defaulted to 3 479 * for --lines-per-file. 480 */ 481 if (digitWidth == 0) 482 { 483 if (numFiles > 0) 484 { 485 digitWidth = 1; 486 uint n = numFiles - 1; 487 while (n >= 10) 488 { 489 n /= 10; 490 ++digitWidth; 491 } 492 } 493 else 494 { 495 digitWidth = 3; 496 } 497 } 498 assert(digitWidth != 0); 499 } 500 catch (Exception exc) 501 { 502 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 503 return tuple(false, 1); 504 } 505 return tuple(true, 0); 506 } 507 } 508 509 /* TsvSplitOptions unit tests (command-line argument processing). 510 * 511 * Basic tests. Many cases are covered in executable tests, including all error cases, 512 * as errors write to stderr. 
 */
unittest
{
    import std.conv : to;
    import std.format : format;

    /* --lines-per-file mode; no files given, so standard input is assumed. */
    {
        auto args = ["unittest", "--lines-per-file", "10"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.files == ["-"]);
        assert(cmdopt.linesPerFile == 10);
        assert(cmdopt.keyFields.empty);
        assert(cmdopt.numFiles == 0);
        assert(cmdopt.hasHeader == false);
    }
    /* --num-files mode without a key. */
    {
        auto args = ["unittest", "--num-files", "20"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.files == ["-"]);
        assert(cmdopt.linesPerFile == 0);
        assert(cmdopt.keyFields.empty);
        assert(cmdopt.numFiles == 20);
        assert(cmdopt.hasHeader == false);
    }
    /* Key fields are converted to zero-based indexing ("1-3" -> [0, 1, 2]). */
    {
        auto args = ["unittest", "-n", "5", "--key-fields", "1-3"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.linesPerFile == 0);
        assert(cmdopt.keyFields == [0, 1, 2]);
        assert(cmdopt.numFiles == 5);
        assert(cmdopt.hasHeader == false);
        assert(cmdopt.keyIsFullLine == false);
    }
    /* Key field zero means the entire line is the key. */
    {
        auto args = ["unittest", "-n", "5", "-k", "0"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.linesPerFile == 0);
        assert(cmdopt.numFiles == 5);
        assert(cmdopt.hasHeader == false);
        assert(cmdopt.keyIsFullLine == true);
    }
    /* --header: headers are read and written to all output files. */
    {
        auto args = ["unittest", "-n", "2", "--header"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.headerInOut == true);
        assert(cmdopt.hasHeader == true);
        assert(cmdopt.headerIn == false);
    }
    /* --header-in-only: headers are read but excluded from output files. */
    {
        auto args = ["unittest", "-n", "2", "--header-in-only"];
        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(args);

        assert(cmdopt.headerInOut == false);
        assert(cmdopt.hasHeader == true);
        assert(cmdopt.headerIn == true);
    }

    /* Helper verifying both the derived suffix and the derived file list. */
    static void testSuffix(string[] args, string expectedSuffix, string[] expectedFiles)
    {
        TsvSplitOptions cmdopt;
        auto savedArgs = args.to!string;
        const r = cmdopt.processArgs(args);

        assert(r[0], format("[testSuffix] cmdopt.processArgs(%s) returned false.", savedArgs));
        assert(cmdopt.suffix == expectedSuffix,
               format("[testSuffix] Incorrect cmdopt.suffix. Expected: '%s', Actual: '%s'\n   cmdopt.processArgs(%s)",
                      expectedSuffix, cmdopt.suffix, savedArgs));
        assert(cmdopt.files == expectedFiles,
               format("[testSuffix] Incorrect cmdopt.files. Expected: %s, Actual: %s\n   cmdopt.processArgs(%s)",
                      expectedFiles, cmdopt.files, savedArgs));
    }

    /* The suffix is taken from the first input file; empty for standard input. */
    testSuffix(["unittest", "-n", "2"], "", ["-"]);
    testSuffix(["unittest", "-n", "2", "--", "-"], "", ["-"]);
    testSuffix(["unittest", "-n", "2", "--suffix", "_123"], "_123", ["-"]);
    testSuffix(["unittest", "-n", "2", "somefile.txt"], ".txt", ["somefile.txt"]);
    testSuffix(["unittest", "-n", "2", "somefile.txt", "anotherfile.pqr"],
               ".txt", ["somefile.txt", "anotherfile.pqr"]);
    testSuffix(["unittest", "-n", "2", "--suffix", ".X", "somefile.txt", "anotherfile.pqr"],
               ".X", ["somefile.txt", "anotherfile.pqr"]);
    testSuffix(["unittest", "-n", "2", "--suffix", "", "somefile.txt"],
               "", ["somefile.txt"]);
    testSuffix(["unittest", "-n", "2", "--", "-", "somefile.txt"],
               "", ["-", "somefile.txt"]);
    testSuffix(["unittest", "-n", "2", "--", "somefile.txt", "-"],
               ".txt", ["somefile.txt", "-"]);

    /* Helper verifying the derived digit width. */
    static void testDigitWidth(string[] args, uint expected)
    {
        TsvSplitOptions cmdopt;
        auto savedArgs = args.to!string;
        const r = cmdopt.processArgs(args);

        assert(r[0], format("[testDigitWidth] cmdopt.processArgs(%s) returned false.", savedArgs));
        assert(cmdopt.digitWidth == expected,
               format("[testDigitWidth] Incorrect cmdopt.digitWidth. Expected: %d, Actual: %d\n   cmdopt.processArgs(%s)",
                      expected, cmdopt.digitWidth, savedArgs));
    }

    /* --num-files derives just enough digits; --lines-per-file defaults to 3. */
    testDigitWidth(["unittest", "-n", "2"], 1);
    testDigitWidth(["unittest", "-n", "2", "--digit-width" , "0"], 1);
    testDigitWidth(["unittest", "-n", "10"], 1);
    testDigitWidth(["unittest", "-n", "11"], 2);
    testDigitWidth(["unittest", "-n", "555"], 3);
    testDigitWidth(["unittest", "-n", "555", "--digit-width" , "2"], 2);
    testDigitWidth(["unittest", "-n", "555", "--digit-width" , "4"], 4);
    testDigitWidth(["unittest", "-l", "10"], 3);
    testDigitWidth(["unittest", "-l", "10000"], 3);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "0"], 3);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "1"], 1);
    testDigitWidth(["unittest", "-l", "10000", "--digit-width", "5"], 5);
}

/** Get the rlimit current number of open files the process is allowed.
 *
 * This routine returns the current soft limit on the number of open files the process
 * is allowed. This is the number returned by the command: '$ ulimit -n'.
 *
 * This routine translates this value to a 'uint', as tsv-split uses 'uint' for
 * tracking output files. The rlimit 'rlim_t' type is usually 'ulong' or 'long'.
 * RLIM_INFINITY and any value larger than 'uint.max' is translated to 'uint.max'.
 *
 * An exception is thrown if the call to 'getrlimit' fails.
647 */ 648 uint rlimitCurrOpenFilesLimit() 649 { 650 import core.sys.posix.sys.resource : 651 rlim_t, rlimit, getrlimit, RLIMIT_NOFILE, RLIM_INFINITY, RLIM_SAVED_CUR; 652 import std.conv : to; 653 654 uint currOpenFileLimit = uint.max; 655 656 rlimit rlimitMaxOpenFiles; 657 658 if (getrlimit(RLIMIT_NOFILE, &rlimitMaxOpenFiles) != 0) 659 { 660 throw new Exception("Internal error: getrlimit call failed"); 661 } 662 663 if (rlimitMaxOpenFiles.rlim_cur != RLIM_INFINITY && 664 rlimitMaxOpenFiles.rlim_cur != RLIM_SAVED_CUR && 665 rlimitMaxOpenFiles.rlim_cur >= 0 && 666 rlimitMaxOpenFiles.rlim_cur <= uint.max) 667 { 668 currOpenFileLimit = rlimitMaxOpenFiles.rlim_cur.to!uint; 669 } 670 671 return currOpenFileLimit; 672 } 673 674 /** Invokes the proper split routine based on the command line arguments. 675 * 676 * This routine is the top-level control after command line argument processing is 677 * done. It's primary job is to set up data structures and invoke the correct 678 * processing routine based on the command line arguments. 679 */ 680 void tsvSplit(TsvSplitOptions cmdopt) 681 { 682 import std.format : format; 683 684 if (cmdopt.linesPerFile != 0) 685 { 686 splitByLineCount(cmdopt); 687 } 688 else 689 { 690 /* Randomly distribute input lines to a specified number of files. */ 691 692 auto outputFiles = 693 SplitOutputFiles(cmdopt.numFiles, cmdopt.dir, cmdopt.prefix, cmdopt.suffix, 694 cmdopt.digitWidth, cmdopt.headerInOut, cmdopt.maxOpenOutputFiles); 695 696 if (!cmdopt.appendToExistingFiles) 697 { 698 string existingFile = outputFiles.checkIfFilesExist; 699 700 if (existingFile.length != 0) 701 { 702 throw new Exception( 703 format("One or more output files already exist. Use '--a|append' to append to existing files. 
File: '%s'.", 704 existingFile)); 705 } 706 } 707 708 if (cmdopt.keyFields.length == 0) 709 { 710 splitLinesRandomly(cmdopt, outputFiles); 711 } 712 else 713 { 714 splitLinesByKey(cmdopt, outputFiles); 715 } 716 } 717 } 718 719 /** A SplitOutputFiles struct holds a collection of output files. 720 * 721 * This struct manages a collection of output files used when writing to multiple 722 * files at once. This includes constructing filenames, opening and closing files, 723 * and writing data and header lines. 724 * 725 * Both random assignment (splitLinesRandomly) and random assignment by key 726 * (splitLinesByKey) use a SplitOutputFiles struct to manage output files. 727 * 728 * The main properties of the output file set are specified in the constuctor. The 729 * exception is the header line. This is not known until the first input file is 730 * read, so it is specified in a separate 'setHeader' call. 731 * 732 * Individual output files are written to based on their zero-based index in the 733 * output collection. The caller selects the output file number to write to and 734 * calls 'writeDataLine' to write a line. The header is written if needed. 
 */
struct SplitOutputFiles
{
    import std.conv : to;
    import std.file : exists;
    import std.format : format;
    import std.path : buildPath;
    import std.stdio : File;

    /* Per-file bookkeeping: name, handle, and open/data state. */
    static struct OutputFile
    {
        string filename;
        File ofile;
        bool hasData;       // True once the file has a header or data line.
        bool isOpen;        // Track separately due to https://github.com/dlang/phobos/pull/7397
    }

    private uint _numFiles;
    private bool _writeHeaders;
    private uint _maxOpenFiles;

    private OutputFile[] _outputFiles;
    private uint _numOpenFiles = 0;    // Invariant: _numOpenFiles <= _maxOpenFiles.
    private string _header;

    this(uint numFiles, string dir, string filePrefix, string fileSuffix,
         uint fileDigitWidth, bool writeHeaders, uint maxOpenFiles)
    {
        assert(numFiles >= 2);
        assert(maxOpenFiles >= 1);

        _numFiles = numFiles;
        _writeHeaders = writeHeaders;
        _maxOpenFiles = maxOpenFiles;

        _outputFiles.length = numFiles;

        /* Filename assignment. '%.*d' zero-pads the index to fileDigitWidth digits. */
        foreach (i, ref f; _outputFiles)
        {
            f.filename =
                buildPath(dir, format("%s%.*d%s", filePrefix, fileDigitWidth, i, fileSuffix));
        }
    }

    /* Destructor ensures all files are closed.
     *
     * Note: A dual check on whether the file is open is made. This is to avoid a
     * Phobos bug where std.File doesn't properly maintain the state of open files
     * if the File.open call fails. See: https://github.com/dlang/phobos/pull/7397.
     */
    ~this()
    {
        foreach (ref f; _outputFiles)
        {
            if (f.isOpen && f.ofile.isOpen)
            {
                assert(_numOpenFiles >= 1);

                f.ofile.close;
                f.isOpen = false;
                _numOpenFiles--;
            }
        }
    }

    /* Check if any of the files already exist.
     *
     * Returns the empty string if none of the files exist. Otherwise returns the
     * filename of the first existing file found. This is to facilitate error
     * message generation.
     */
    string checkIfFilesExist()
    {
        foreach (f; _outputFiles) if (f.filename.exists) return f.filename;
        return "";
    }

    /* Sets the header line.
     *
     * Should be called prior to writeDataLine when headers are being written. This
     * method is separate from the constructor because the header is not available
     * until the first line of a file is read.
     *
     * Headers are only written if 'writeHeaders' is specified as true in the
     * constructor. As a convenience, this routine can be called even if headers are
     * not being written.
     */
    void setHeader(const char[] header)
    {
        _header = header.to!string;
    }

    /* Picks a random file to close. Used when the open file handle limit has been
     * reached.
     */
    private void closeSomeFile()
    {
        import std.random : uniform;
        assert(_numOpenFiles > 0);

        /* Scan for an open file starting from a random position, wrapping around. */
        immutable uint start = uniform(0, _numFiles);

        foreach (i; cycle(iota(_numFiles), start).take(_numFiles))
        {
            if (_outputFiles[i].isOpen)
            {
                _outputFiles[i].ofile.close;
                _outputFiles[i].isOpen = false;
                _numOpenFiles--;

                return;
            }
        }

        assert(false, "[SplitOutputFiles.closeSomeFile]: Could not find file to close.");
    }

    /* Write a line to the specified file number.
     *
     * A header is written to the file if headers are being written and this is the
     * first data written to the file.
     */
    void writeDataLine(uint fileNum, const char[] data)
    {
        assert(fileNum < _numFiles);
        assert(fileNum < _outputFiles.length);
        assert(_numOpenFiles <= _maxOpenFiles);

        OutputFile* outputFile = &_outputFiles[fileNum];

        if (!outputFile.isOpen)
        {
            /* Evict another open file first if at the open-handle cap. */
            if (_numOpenFiles == _maxOpenFiles) closeSomeFile();
            assert(_numOpenFiles < _maxOpenFiles);

            outputFile.ofile = outputFile.filename.File("a");
            outputFile.isOpen = true;
            _numOpenFiles++;

            if (!outputFile.hasData)
            {
                /* Append mode: a pre-existing non-empty file counts as having data,
                 * so the header is not re-written. (ulong.max guards an error size.)
                 */
                ulong filesize = outputFile.ofile.size;
                outputFile.hasData = (filesize > 0 && filesize != ulong.max);
            }
        }

        if (_writeHeaders && !outputFile.hasData) outputFile.ofile.writeln(_header);

        outputFile.ofile.writeln(data);
        outputFile.hasData = true;
    }
}

/** Write input lines to multiple files, randomly selecting an output file for each line.
 */
void splitLinesRandomly(TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles)
{
    import std.random : Random = Mt19937, uniform;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    auto randomGenerator = Random(cmdopt.seed);

    /* Process each line. */
    foreach (inputFileNum, filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the first file's header is used for output. */
                if (inputFileNum == 0) outputFiles.setHeader(line);
            }
            else
            {
                immutable uint outputFileNum = uniform(0, cmdopt.numFiles, randomGenerator);
                outputFiles.writeDataLine(outputFileNum, line);
            }
        }

        /* Close input files immediately after use to preserve open file handles.
917 * File close occurs when variable goes out scope, but not immediately in the 918 * case of loop termination. Avoids open file errors when the number of 919 * output files exceeds the open file limit. 920 */ 921 if (filename != "-") inputStream.close; 922 } 923 } 924 925 /** Write input lines to multiple output files using fields as a random selection key. 926 * 927 * Each input line is written to an output file. The output file is chosen using 928 * fields as a key. Each unique key is assigned to a file. All lines having the 929 * same key are written to the same file. 930 */ 931 void splitLinesByKey(TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) 932 { 933 import std.algorithm : splitter; 934 import std.conv : to; 935 import std.digest.murmurhash; 936 import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, throwIfWindowsNewlineOnUnix; 937 938 assert(cmdopt.keyFields.length > 0); 939 940 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 941 942 /* Create a mapping for the key fields. */ 943 auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); 944 945 /* Process each line. */ 946 foreach (inputFileNum, filename; cmdopt.files) 947 { 948 auto inputStream = (filename == "-") ? stdin : filename.File(); 949 foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1)) 950 { 951 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 952 if (fileLineNum == 1 && cmdopt.hasHeader) 953 { 954 if (inputFileNum == 0) outputFiles.setHeader(line); 955 } 956 else 957 { 958 /* Murmurhash works by successively adding individual keys, then finalizing. 959 * Adding individual keys is simpler if the full-line-as-key and individual 960 * fields as keys cases are separated. 
                 */
                auto hasher = MurmurHash3!32(cmdopt.seed);

                if (cmdopt.keyIsFullLine)
                {
                    hasher.put(cast(ubyte[]) line);
                }
                else
                {
                    assert(keyFieldsReordering !is null);

                    /* Gather the key field values and assemble the key. */
                    keyFieldsReordering.initNewLine;
                    foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                    {
                        keyFieldsReordering.processNextField(fieldIndex, fieldValue);
                        if (keyFieldsReordering.allFieldsFilled) break;
                    }

                    if (!keyFieldsReordering.allFieldsFilled)
                    {
                        import std.format : format;
                        throw new Exception(
                            format("Not enough fields in line. File: %s, Line: %s",
                                   (filename == "-") ? "Standard Input" : filename, fileLineNum));
                    }

                    /* Join the key fields with the delimiter so the multi-field key
                     * hashes the same as the delimited text would.
                     */
                    foreach (count, key; keyFieldsReordering.outputFields.enumerate)
                    {
                        if (count > 0) hasher.put(delimArray);
                        hasher.put(cast(ubyte[]) key);
                    }
                }

                hasher.finish;
                immutable uint outputFileNum = hasher.get % cmdopt.numFiles;
                outputFiles.writeDataLine(outputFileNum, line);
            }
        }

        /* Close input files immediately after use to preserve open file handles.
         * File close occurs when the variable goes out of scope, but not immediately
         * in the case of loop termination. Avoids open file errors when the number of
         * output files exceeds the open file limit.
         */
        if (filename != "-") inputStream.close;
    }
}

/** Write input lines to multiple files, splitting based on line count.
 *
 * Note: readBufferSize is an argument primarily for unit test purposes. Normal uses
 * should use the default value.
 */
void splitByLineCount(TsvSplitOptions cmdopt, const size_t readBufferSize = 1024L * 512L)
{
    import std.array : appender;
    import std.file : exists;
    import std.format : format;
    import std.path : buildPath;
    import std.stdio : File;

    assert(readBufferSize > 0);
    ubyte[] readBuffer = new ubyte[readBufferSize];

    auto header = appender!(ubyte[])();
    bool headerSaved = !cmdopt.headerInOut;    // True if 'header' has been saved, or does not need to be.
    size_t nextOutputFileNum = 0;
    File outputFile;
    string outputFileName;
    bool isOutputFileOpen = false;             // Open file status tracked separately due to phobos bugs
    size_t outputFileRemainingLines;

    /* nextNewlineIndex finds the index of the next newline character. It is an
     * alternative to std.algorithm.countUntil. Invoking 'find' directly results in
     * 'memchr' being used (faster). The current 'countUntil' implementation does
     * forward to find, but the way it is done avoids the memchr call optimization.
     */
    static long nextNewlineIndex(const ubyte[] buffer)
    {
        import std.algorithm : find;
        immutable ubyte newlineChar = '\n';
        immutable size_t buflen = buffer.length;
        immutable size_t findlen = buffer.find(newlineChar).length;

        return findlen > 0 ? buflen - findlen : -1;
    }

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File("rb");
        bool isReadingHeader = cmdopt.hasHeader;

        foreach (ref ubyte[] inputChunk; inputStream.byChunk(readBuffer))
        {
            size_t nextOutputChunkStart = 0;

            if (isReadingHeader)
            {
                immutable newlineIndex = nextNewlineIndex(inputChunk);

                if (newlineIndex == -1)
                {
                    /* Rare case - Header line longer than read buffer. Keep reading
                     * the header.
                     */
                    if (!headerSaved) put(header, inputChunk);
                    continue;
                }
                else
                {
                    if (!headerSaved)
                    {
                        put(header, inputChunk[0 .. newlineIndex + 1]);
                        headerSaved = true;
                    }
                    isReadingHeader = false;
                    nextOutputChunkStart = newlineIndex + 1;
                }
            }

            /* Done with the header. Process the rest of the inputChunk. */

            auto remainingInputChunk = inputChunk[nextOutputChunkStart .. $];

            while (!remainingInputChunk.empty)
            {
                /* See if the next output file needs to be opened. */
                if (!isOutputFileOpen)
                {
                    outputFileName =
                        buildPath(cmdopt.dir,
                                  format("%s%.*d%s", cmdopt.prefix,
                                         cmdopt.digitWidth, nextOutputFileNum, cmdopt.suffix));

                    if (!cmdopt.appendToExistingFiles && outputFileName.exists)
                    {
                        throw new Exception(
                            format("Output file already exists. Use '--a|append' to append to existing files. File: '%s'.",
                                   outputFileName));
                    }

                    outputFile = outputFileName.File("ab");
                    isOutputFileOpen = true;
                    ++nextOutputFileNum;
                    outputFileRemainingLines = cmdopt.linesPerFile;

                    assert(headerSaved);

                    if (cmdopt.headerInOut)
                    {
                        /* Only write the header if the file is empty. NOTE(review):
                         * ulong.max appears to flag an unknown (non-seekable) size and
                         * is treated like empty — confirm against File.size docs.
                         */
                        ulong filesize = outputFile.size;
                        if (filesize == 0 || filesize == ulong.max) outputFile.rawWrite(header.data);
                    }
                }

                /* Find more newlines for the current output file. */

                assert(outputFileRemainingLines > 0);

                size_t nextOutputChunkEnd = nextOutputChunkStart;

                while (outputFileRemainingLines != 0 && !remainingInputChunk.empty)
                {
                    /* Note: newlineIndex is relative to 'remainingInputChunk', not
                     * 'inputChunk'. Updates to variables referring to 'inputChunk'
                     * need to reflect this. In particular, 'nextOutputChunkEnd'.
                     */
                    immutable newlineIndex = nextNewlineIndex(remainingInputChunk);

                    if (newlineIndex == -1)
                    {
                        nextOutputChunkEnd = inputChunk.length;
                    }
                    else
                    {
                        --outputFileRemainingLines;
                        nextOutputChunkEnd += (newlineIndex + 1);
                    }

                    remainingInputChunk = inputChunk[nextOutputChunkEnd .. $];
                }

                assert(nextOutputChunkStart < nextOutputChunkEnd);
                assert(nextOutputChunkEnd <= inputChunk.length);

                outputFile.rawWrite(inputChunk[nextOutputChunkStart .. nextOutputChunkEnd]);

                if (outputFileRemainingLines == 0)
                {
                    outputFile.close;
                    isOutputFileOpen = false;
                }

                nextOutputChunkStart = nextOutputChunkEnd;

                assert(remainingInputChunk.length == inputChunk.length - nextOutputChunkStart);
            }
        }
    }
}

/* splitByLineCount unit tests.
 *
 * These tests are primarily for buffer management. There are edge cases involving the
 * interaction of buffer size, input file size, lines-per-file, and newline placement
 * that are difficult to test against the executable.
 */
unittest
{
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.algorithm : min;
    import std.array : appender;
    import std.conv : to;
    import std.file : exists, mkdir, rmdirRecurse;
    import std.format : format;
    import std.path : buildPath;
    import std.process : escapeShellCommand, executeShell;

    /* Test setup
     *
     * A set of twenty-four input files is created, with names: input_NxM.txt, where
     * N is the number of characters in each row and M is the number of rows (lines).
     * The resulting files are put in the "lc_input" directory ('inputDir' variable)
     * and have names:
     *     input_0x2.txt, input_0x3.txt, ... input_5x5.txt.
     *
     * A standalone block of code produces the expected result files for splitting an
     * input file into a set of output files. This duplicates the splitByLineCount
     * output. This is done for lines-per-file counts 1 to 5. Each result set is placed
     * in a subdirectory under "lc_expected" ('expectedDir' variable). Subdirectories
     * have names like: "0x2_by_1", "0x3_by_1", ..., "5x5_by_4".
     *
     * splitByLineCount is called for all the same input files and lines-per-file
     * settings used to produce the expected output. This is done via
     * testSplitByLineCount, which calls command line argument processing and
     * splitByLineCount, similar to how the main program works. The results are written
     * to a subdirectory. The subdirectory is compared to the expected output directory
     * using the system 'diff' command.
     *
     * splitByLineCount is called multiple times for each expected output case. The
     * different calls iterate over a series of small readBufferSizes. This is how
     * tests for edge cases in the readBufferSize vs line lengths, newline placement,
     * etc., are accomplished.
     *
     * Note: One way to understand what is going on is to comment out the line:
     *
     *    scope(exit) testDir.rmdirRecurse;
     *
     * Then run the test (e.g. 'make test') and look at the directory structure left
     * behind. Print out the 'testDir' directory to see where it is located.
     */

    /* testSplitByLineCount acts as a surrogate for main() and tsvSplit(). It makes the
     * call to splitByLineCount and calls 'diff' to compare the output directory to the
     * expected directory. An assert is thrown if the directories do not match.
     */
    static void testSplitByLineCount(string[] cmdArgs, string expectedDir,
                                     size_t readBufferSize = 1024L * 512L)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testSplitByLineCount] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSplitByLineCount] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSplitOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
        assert(cmdopt.linesPerFile != 0, "[testSplitByLineCount] --lines-per-file is required.");
        assert(!cmdopt.dir.empty, "[testSplitByLineCount] --dir is required.");

        splitByLineCount(cmdopt, readBufferSize);

        /* Diff command setup. */
        auto diffCmdArgs = ["diff", expectedDir, cmdopt.dir];
        auto diffResult = executeShell(escapeShellCommand(diffCmdArgs));
        assert(diffResult.status == 0,
               format("[testSplitByLineCount]\n cmd: %s\n readBufferSize: %d\n expectedDir: %s\n------ Diff ------%s\n-------",
                      savedCmdArgs, readBufferSize, expectedDir, diffResult.output));
    }

    auto testDir = makeUnittestTempDir("tsv_split");
    scope(exit) testDir.rmdirRecurse;

    auto inputDir = buildPath(testDir, "lc_input");
    auto outputDir = buildPath(testDir, "lc_output");
    auto expectedDir = buildPath(testDir, "lc_expected");

    mkdir(inputDir);
    mkdir(outputDir);
    mkdir(expectedDir);

    /* Returns the canonical input file path for a line-length/row-count combo. */
    static string buildInputFilePath(string dir, long inputLineLength, long inputFileNumLines)
    {
        return buildPath(dir, format("input_%dx%d.txt", inputLineLength, inputFileNumLines));
    }

    string[5] outputRowData =
        [
            "abcde",
            "fghij",
            "klmno",
            "pqrst",
            "uvwxy"
        ];

    /* The main test loop. Iterates over input line lengths, numbers of rows,
     * lines-per-file, and finally readBufferSize lengths. All combos are tested.
     */
    foreach (inputLineLength; 0 .. 6)
    {
        foreach (inputFileNumLines; 2 .. 6)
        {
            auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);

            {
                auto ofile = inputFile.File("w");
                auto output = appender!(char[])();
                foreach (m; 0 .. inputFileNumLines)
                {
                    put(output, outputRowData[m][0 .. inputLineLength]);
                    put(output, '\n');
                }
                ofile.write(output.data);
                ofile.close;
            }

            /* Iterate over the different lines-per-file lengths.
             * - Create an expected output directory and files for each.
             * - Test with different readBufferSize values.
             */
            foreach (outputFileNumLines; 1 .. min(5, inputFileNumLines))
            {
                auto expectedSubDir =
                    buildPath(expectedDir, format("%dx%d_by_%d", inputLineLength,
                                                  inputFileNumLines, outputFileNumLines));
                mkdir(expectedSubDir);

                size_t filenum = 0;
                size_t linesWritten = 0;
                while (linesWritten < inputFileNumLines)
                {
                    auto expectedFile = buildPath(expectedSubDir, format("part_%d.txt", filenum));
                    auto f = expectedFile.File("w");
                    auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);
                    foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
                    {
                        f.writeln(line[0 .. inputLineLength]);
                    }
                    linesWritten += linesToWrite;
                    ++filenum;
                    f.close;
                }

                /* Test the different readBufferSizes.
                 * - An output directory is created for the run and deleted afterward.
                 * - First test the default size.
                 * - Then iterate over small readBufferSize values.
                 */
                auto outputSubDir =
                    buildPath(outputDir, format("%dx%d_by_%d", inputLineLength,
                                                inputFileNumLines, outputFileNumLines));
                mkdir(outputSubDir);

                testSplitByLineCount(
                    ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                     "--digit-width", "1", inputFile],
                    expectedSubDir);

                outputSubDir.rmdirRecurse;

                foreach (readBufSize; 1 .. 8)
                {
                    mkdir(outputSubDir);

                    testSplitByLineCount(
                        ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                         "--digit-width", "1", inputFile],
                        expectedSubDir, readBufSize);

                    outputSubDir.rmdirRecurse;
                }
            }
        }
    }

    {
        /* Tests for the special case where readBufferSize is smaller than the header
         * line. We'll reuse the input_5x4.txt input file and write 1 line-per-file.
         */
        immutable inputLineLength = 5;
        immutable inputFileNumLines = 4;
        immutable outputFileNumLines = 1;

        auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);
        assert(inputFile.exists);

        auto expectedSubDirHeader =
            buildPath(expectedDir, format("%dx%d_by_%d_header", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        auto expectedSubDirHeaderInOnly =
            buildPath(expectedDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        mkdir(expectedSubDirHeader);
        mkdir(expectedSubDirHeaderInOnly);

        /* Generate the expected results. Cheat by starting with linesWritten = 1. This
         * automatically excludes the header line, but keeps the loop code consistent
         * with the main test loop.
         */
        size_t filenum = 0;
        size_t linesWritten = 1;
        while (linesWritten < inputFileNumLines)
        {
            auto expectedFileHeader = buildPath(expectedSubDirHeader, format("part_%d.txt", filenum));
            auto expectedFileHeaderInOnly = buildPath(expectedSubDirHeaderInOnly,
                                                      format("part_%d.txt", filenum));
            auto fHeader = expectedFileHeader.File("w");
            auto fHeaderInOnly = expectedFileHeaderInOnly.File("w");
            auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);

            fHeader.writeln(outputRowData[0][0 .. inputLineLength]);
            foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
            {
                fHeader.writeln(line[0 .. inputLineLength]);
                fHeaderInOnly.writeln(line[0 .. inputLineLength]);
            }
            linesWritten += linesToWrite;
            ++filenum;
            fHeader.close;
            fHeaderInOnly.close;
        }

        /* Now run the tests. */
        auto outputSubDirHeader =
            buildPath(outputDir, format("%dx%d_by_%d_header", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));
        auto outputSubDirHeaderInOnly =
            buildPath(outputDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));

        foreach (readBufSize; 1 .. 6)
        {
            mkdir(outputSubDirHeader);
            mkdir(outputSubDirHeaderInOnly);

            testSplitByLineCount(
                ["test", "--header", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeader, "--digit-width", "1", inputFile],
                expectedSubDirHeader, readBufSize);

            testSplitByLineCount(
                ["test", "--header-in-only", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeaderInOnly, "--digit-width", "1", inputFile],
                expectedSubDirHeaderInOnly, readBufSize);

            outputSubDirHeader.rmdirRecurse;
            outputSubDirHeaderInOnly.rmdirRecurse;
        }
    }
}