/**
Command line tool for splitting a file (or files) into multiple output files.
Several methods for splitting are available, including splitting by line count,
splitting by random assignment, and splitting by random assignment based on
key fields.

Copyright (c) 2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_split;

import std.exception : enforce;
import std.format : format;
import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program.
     *
     * Runs command line argument processing, then hands control to tsvSplit to do
     * the real work. Any exception raised during processing is caught here and
     * reported to the user on stderr.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSplitOptions cmdopt;
        const argsResult = cmdopt.processArgs(cmdArgs);
        if (!argsResult[0]) return argsResult[1];

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        try
        {
            tsvSplit(cmdopt);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}

immutable helpText = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

By default, files are written to the current directory and have names
of the form 'part_NNN<suffix>', with 'NNN' being a number and <suffix>
being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The output
directory and file names are customizable.

Fields are specified using field number or field name. Field names
require that the input file has a header line.

Use '--help-verbose' for more detailed information.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

Output files: By default, files are written to the current directory and
have names of the form 'part_NNN<suffix>', with 'NNN' being a number and
<suffix> being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The suffix is
empty when reading from standard input. The numeric part defaults to 3
digits for '--l|lines-per-files'. For '--n|num-files' enough digits are
used so all filenames are the same length. The output directory and file
names are customizable.

Header lines: There are two ways to handle input with headers: write a
header to all output files (--H|header), or exclude headers from all
output files ('--I|header-in-only'). The best choice depends on the
follow-up processing. All tsv-utils tools support header lines in multiple
input files, but many other tools do not. For example, GNU parallel works
best on files without header lines.

Random assignment (--n|num-files): Random distribution of records to a set
of files is a common task. When data fits in memory the preferred approach
is usually to shuffle the data and split it into fixed sized blocks. E.g.
'tsv-sample data.tsv | tsv-split -l NUM'. However, alternate approaches
are needed when data is too large for convenient shuffling. tsv-split's
random assignment feature is useful in this case. Each input line is
written to a randomly selected output file. Note that output files will
have similar but not identical numbers of records.

Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): This
splits a data set into multiple files sharded by key. All lines with the
same key are written to the same file. This partitioning enables parallel
computation based on the key. For example, statistical calculation
('tsv-summarize --group-by') or duplicate removal ('tsv-uniq --fields').
These operations can be parallelized using tools like GNU parallel, which
simplifies concurrent operations on multiple files. Fields are specified
using field number or field name. Field names require that the input file
has a header line. Use '--help-fields' for details about field names.

Random seed: By default, each tsv-split invocation using random assignment
or random assignment by key produces different assignments to the output
files. Using '--s|static-seed' changes this so multiple runs produce the
same assignments. This works by using the same random seed each run. The
seed can be specified using '--v|seed-value'.

Appending to existing files: By default, an error is triggered if an
output file already exists. '--a|append' changes this so that lines are
appended to existing files. (Header lines are not appended to files with
data.) This is useful when adding new data to files created by a previous
tsv-split run. Random assignment should use the same '--n|num-files' value
each run, but different random seeds (avoid '--s|static-seed'). Random
assignment by key should use the same '--n|num-files', '--k|key-fields',
and seed ('--s|static-seed' or '--v|seed-value') each run.

Max number of open files: Random assignment and random assignment by key
are dramatically faster when all output files are kept open. However,
keeping a large numbers of open files can bump into system limits or limit
resources available to other processes. By default, tsv-split uses up to
4096 open files or the system per-process limit, whichever is smaller.
This can be changed using '--max-open-files', though it cannot be set
larger than the system limit. The system limit varies considerably between
systems. On many systems it is unlimited. On MacOS it is often set to 256.
Use Unix 'ulimit' to display and modify the limits:
* 'ulimit -n' - Show the "soft limit". The per-process maximum.
* 'ulimit -Hn' - Show the "hard limit". The max allowed soft limit.
* 'ulimit -Sn NUM' - Change the "soft limit" to NUM.

Examples:

   # Split a 10 million line file into 1000 files, 10,000 lines each.
   # Output files are part_000.txt, part_001.txt, ... part_999.txt.
   tsv-split data.txt --lines-per-file 10000

   # Same as the previous example, but write files to a subdirectory.
   tsv-split data.txt --dir split_files --lines-per-file 10000

   # Split a file into 10,000 line files, writing a header line to each
   tsv-split data.txt -H --lines-per-file 10000

   # Same as the previous example, but dropping the header line.
   tsv-split data.txt -I --lines-per-file 10000

   # Randomly assign lines to 1000 files
   tsv-split data.txt --num-files 1000

   # Randomly assign lines to 1000 files while keeping unique entries
   # from the 'url' field together.
   tsv-split data.tsv -H -k url --num-files 1000

   # Randomly assign lines to 1000 files. Later, randomly assign lines
   # from a second data file to the same output files.
   tsv-split data1.tsv -n 1000
   tsv-split data2.tsv -n 1000 --append

   # Randomly assign lines to 1000 files using field 3 as a key.
   # Later, add a second file to the same output files.
   tsv-split data1.tsv -n 1000 -k 3 --static-seed
   tsv-split data2.tsv -n 1000 -k 3 --static-seed --append

   # Change the system per-process open file limit for one command.
   # The parens create a sub-shell. The current shell is not changed.
   ( ulimit -Sn 1000 && tsv-split --num-files 1000 data.txt )

Options:
EOS";

/** Container for command line options and derived data.
 *
 * TsvSplitOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
219 * 220 * Once argument processing is complete, TsvSplitOptions is used as a container 221 * holding the specific processing options used by the splitting algorithms. 222 */ 223 struct TsvSplitOptions 224 { 225 import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; 226 227 enum invalidFileSuffix = "///////"; 228 229 string programName; /// Program name 230 InputSourceRange inputSources; /// Input files 231 bool headerInOut = false; /// --H|header 232 bool headerIn = false; /// --I|header-in-only 233 size_t linesPerFile = 0; /// --l|lines-per-file 234 uint numFiles = 0; /// --n|num-files 235 size_t[] keyFields; /// Derived: --k|key-fields 236 string dir; /// --dir 237 string prefix = "part_"; /// --prefix 238 string suffix = invalidFileSuffix; /// --suffix 239 uint digitWidth = 0; /// --w|digit-width 240 bool appendToExistingFiles = false; /// --a|append 241 bool staticSeed = false; /// --s|static-seed 242 uint seedValueOptionArg = 0; /// --v|seed-value 243 char delim = '\t'; /// --d|delimiter 244 uint maxOpenFilesArg = 0; /// --max-open-files 245 bool hasHeader = false; /// Derived. True if either '--H|header' or '--I|header-in-only' is set. 246 bool keyIsFullLine = false; /// Derived. True if '--f|fields 0' is specfied. 247 bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value 248 uint seed = 0; /// Derived from --static-seed, --seed-value 249 uint maxOpenOutputFiles; /// Derived. 250 251 /** Process tsv-split command line arguments. 252 * 253 * Defines the command line options, performs validation, and derives additional 254 * state. std.getopt.getopt is called to do the main option processing followed 255 * additional validation and derivation. 256 * 257 * Help text is printed to standard output if help was requested. Error text is 258 * written to stderr if invalid input is encountered. 259 * 260 * A tuple is returned. 
First value is true if command line arguments were 261 * successfully processed and execution should continue, or false if an error 262 * occurred or the user asked for help. If false, the second value is the 263 * appropriate exit code (0 or 1). 264 * 265 * Returning true (execution continues) means args have been validated and derived 266 * values calculated. Field indices will have been converted to zero-based. 267 */ 268 auto processArgs(ref string[] cmdArgs) 269 { 270 import std.algorithm : all, canFind, each, min; 271 import std.conv : to; 272 import std.file : exists, isDir; 273 import std.getopt; 274 import std.math : isNaN; 275 import std.path : baseName, expandTilde, extension, stripExtension; 276 import std.typecons : Yes, No; 277 import tsv_utils.common.fieldlist; 278 279 bool helpVerbose = false; // --help-verbose 280 bool helpFields = false; // --help-fields 281 bool versionWanted = false; // --V|version 282 string keyFieldsArg; // --k|key-fields 283 284 string keyFieldsOptionString = "k|key-fields"; 285 286 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 287 288 try 289 { 290 arraySep = ","; // Use comma to separate values in command line options 291 auto r = getopt( 292 cmdArgs, 293 "help-verbose", " Print more detailed help.", &helpVerbose, 294 "help-fields", " Print help on specifying fields.", &helpFields, 295 296 std.getopt.config.caseSensitive, 297 "H|header", " Input files have a header line. Write the header to each output file.", &headerInOut, 298 "I|header-in-only", " Input files have a header line. Do not write the header to output files.", &headerIn, 299 std.getopt.config.caseInsensitive, 300 301 "l|lines-per-file", "NUM Number of lines to write to each output file (excluding the header line).", &linesPerFile, 302 "n|num-files", "NUM Number of output files to generate.", &numFiles, 303 304 keyFieldsOptionString, 305 "<field-list> Fields to use as key. 
Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.", 306 &keyFieldsArg, 307 308 "dir", "STR Directory to write to. Default: Current working directory.", &dir, 309 "prefix", "STR Filename prefix. Default: 'part_'", &prefix, 310 "suffix", "STR Filename suffix. Default: First input file extension. None for standard input.", &suffix, 311 "w|digit-width", "NUM Number of digits in filename numeric portion. Default: '--l|lines-per-file': 3. '--n|num-files': Chosen so filenames have the same length. '--w|digit-width 0' uses the default.", &digitWidth, 312 "a|append", " Append to existing files.", &appendToExistingFiles, 313 314 "s|static-seed", " Use the same random seed every run.", &staticSeed, 315 316 std.getopt.config.caseSensitive, 317 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, 318 std.getopt.config.caseInsensitive, 319 320 "d|delimiter", "CHR Field delimiter.", &delim, 321 "max-open-files", "NUM Maximum open file handles to use. Min of 5 required.", &maxOpenFilesArg, 322 323 std.getopt.config.caseSensitive, 324 "V|version", " Print version information and exit.", &versionWanted, 325 std.getopt.config.caseInsensitive, 326 ); 327 328 if (r.helpWanted) 329 { 330 defaultGetoptPrinter(helpText, r.options); 331 return tuple(false, 0); 332 } 333 else if (helpVerbose) 334 { 335 defaultGetoptPrinter(helpTextVerbose, r.options); 336 return tuple(false, 0); 337 } 338 else if (helpFields) 339 { 340 writeln(fieldListHelpText); 341 return tuple(false, 0); 342 } 343 else if (versionWanted) 344 { 345 import tsv_utils.common.tsvutils_version; 346 writeln(tsvutilsVersionNotice("tsv-split")); 347 return tuple(false, 0); 348 } 349 350 /* Remaining command line args are files. 351 */ 352 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. 
$] : ["-"]; 353 cmdArgs.length = 1; 354 355 /* Validation and derivations - Do as much validation prior to header line 356 * processing as possible (avoids waiting on stdin). 357 * 358 * Note: keyFields depends on header line processing, but keyFieldsArg 359 * can be used to detect whether the command line argument was specified. 360 */ 361 362 enforce(!(headerInOut && headerIn), 363 "Use only one of '--H|header' and '--I|header-in-only'."); 364 365 hasHeader = headerInOut || headerIn; 366 367 enforce(linesPerFile != 0 || numFiles != 0, 368 "Either '--l|lines-per-file' or '--n|num-files' is required."); 369 370 enforce(linesPerFile == 0 || numFiles == 0, 371 "'--l|lines-per-file' and '--n|num-files' cannot be used together."); 372 373 enforce(linesPerFile == 0 || keyFieldsArg.length == 0, 374 "'--l|lines-per-file' and '--k|key-fields' cannot be used together."); 375 376 enforce(numFiles != 1, "'--n|num-files must be two or more."); 377 378 if (!dir.empty) 379 { 380 dir = dir.expandTilde; 381 enforce(dir.exists, format("Directory does not exist: --dir '%s'", dir)); 382 enforce(dir.isDir, format("Path is not a directory: --dir '%s'", dir)); 383 } 384 385 /* Seed. */ 386 import std.random : unpredictableSeed; 387 388 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 389 390 if (usingUnpredictableSeed) seed = unpredictableSeed; 391 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 392 else if (staticSeed) seed = 2438424139; 393 else assert(0, "Internal error, invalid seed option states."); 394 395 /* Maximum number of open files. Mainly applies when --num-files is used. 396 * 397 * Derive maxOpenOutputFiles. Inputs: 398 * - Internal default limit: 4096. This is a somewhat conservative setting. 399 * - rlimit open files limit. Defined by '$ ulimit -n'. 400 * - '--max-open-files' (maxOpenFilesArg). This adjusts the internal limit, 401 * but only up to the rlimit value. 
402 * - Four open files are reserved for stdin, stdout, stderr, and one input 403 * file. 404 */ 405 406 immutable uint internalDefaultMaxOpenFiles = 4096; 407 immutable uint numReservedOpenFiles = 4; 408 immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit(); 409 410 enforce(maxOpenFilesArg == 0 || maxOpenFilesArg > numReservedOpenFiles, 411 format("'--max-open-files' must be at least %d.", 412 numReservedOpenFiles + 1)); 413 414 enforce(maxOpenFilesArg <= rlimitOpenFilesLimit, 415 format("'--max-open-files' value (%d) greater current system limit (%d)." ~ 416 "\nRun 'ulimit -n' to see the soft limit." ~ 417 "\nRun 'ulimit -Hn' to see the hard limit." ~ 418 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 419 maxOpenFilesArg, rlimitOpenFilesLimit)); 420 421 enforce(rlimitOpenFilesLimit > numReservedOpenFiles, 422 format("System open file limit too small. Current value: %d. Must be %d or more." ~ 423 "\nRun 'ulimit -n' to see the soft limit." ~ 424 "\nRun 'ulimit -Hn' to see the hard limit." ~ 425 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 426 rlimitOpenFilesLimit, numReservedOpenFiles + 1)); 427 428 immutable uint openFilesLimit = 429 (maxOpenFilesArg != 0) 430 ? maxOpenFilesArg 431 : min(internalDefaultMaxOpenFiles, rlimitOpenFilesLimit); 432 433 assert(openFilesLimit > numReservedOpenFiles); 434 435 maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles; 436 437 /* Suffix - If not provided, use the extension of the first input file. 438 * No suffix if reading from standard input. 439 */ 440 if (suffix == invalidFileSuffix) suffix = filepaths[0].extension; 441 442 /* Ensure forward slash is not included in the filename prefix and suffix. 443 * Forward slash is an invalid Unix filename character. However, open file 444 * calls could match a directory path, resulting in unintended file 445 * creation. 446 * 447 * The other invalid filename character on Unix is the NULL character. 
448 * However, the NULL character cannot be entered via Unix command lines, 449 * so there is no need to test for it explicitly. 450 */ 451 enforce(!prefix.canFind('/'), 452 "'--prefix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 453 454 enforce(!suffix.canFind('/'), 455 "'--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 456 457 /* Digit width - If not specified, or specified as zero, the width is 458 * determined by the number of files for --num-files, or defaulted to 3 459 * for --lines-per-file. 460 */ 461 if (digitWidth == 0) 462 { 463 if (numFiles > 0) 464 { 465 digitWidth = 1; 466 uint n = numFiles - 1; 467 while (n >= 10) 468 { 469 n /= 10; 470 ++digitWidth; 471 } 472 } 473 else 474 { 475 digitWidth = 3; 476 } 477 } 478 assert(digitWidth != 0); 479 480 /* 481 * Create the inputSourceRange and perform header line processing. 482 */ 483 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 484 inputSources = inputSourceRange(filepaths, readHeader); 485 486 string[] headerFields; 487 488 if (hasHeader) headerFields = inputSources.front.header.split(delim).to!(string[]); 489 490 if (!keyFieldsArg.empty) 491 { 492 keyFields = 493 keyFieldsArg 494 .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) 495 (hasHeader, headerFields, keyFieldsOptionString) 496 .array; 497 } 498 499 if (keyFields.length > 0) 500 { 501 if (keyFields.length == 1 && keyFields[0] == 0) 502 { 503 keyIsFullLine = true; 504 } 505 else 506 { 507 enforce(keyFields.all!(x => x != 0), 508 "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); 509 510 keyFields.each!((ref x) => --x); // Convert to zero-based indexing. 
511 } 512 } 513 514 } 515 catch (Exception exc) 516 { 517 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 518 return tuple(false, 1); 519 } 520 return tuple(true, 0); 521 } 522 } 523 524 /* TsvSplitOptions unit tests (command-line argument processing). 525 * 526 * Basic tests. Many cases are covered in executable tests, including all error cases, 527 * as errors write to stderr. 528 */ 529 unittest 530 { 531 import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. 532 import std.conv : to; 533 import std.file : mkdir, rmdirRecurse; 534 import std.path : buildPath; 535 536 /* A dummy file is used so we don't have to worry about the cases where command 537 * line processing might open a file. Don't want to use standard input for this, 538 * at least in cases where it might try to read to get the header line. 539 */ 540 auto testDir = makeUnittestTempDir("tsv_split_bylinecount"); 541 scope(exit) testDir.rmdirRecurse; 542 543 string somefile_txt = buildPath(testDir, "somefile.txt"); 544 somefile_txt.File("w").writeln("Hello World!"); 545 546 { 547 auto args = ["unittest", "--lines-per-file", "10", somefile_txt]; 548 TsvSplitOptions cmdopt; 549 const r = cmdopt.processArgs(args); 550 551 assert(cmdopt.linesPerFile == 10); 552 assert(cmdopt.keyFields.empty); 553 assert(cmdopt.numFiles == 0); 554 assert(cmdopt.hasHeader == false); 555 } 556 { 557 auto args = ["unittest", "--num-files", "20", somefile_txt]; 558 TsvSplitOptions cmdopt; 559 const r = cmdopt.processArgs(args); 560 561 assert(cmdopt.linesPerFile == 0); 562 assert(cmdopt.keyFields.empty); 563 assert(cmdopt.numFiles == 20); 564 assert(cmdopt.hasHeader == false); 565 } 566 { 567 auto args = ["unittest", "-n", "5", "--key-fields", "1-3", somefile_txt]; 568 TsvSplitOptions cmdopt; 569 const r = cmdopt.processArgs(args); 570 571 assert(cmdopt.linesPerFile == 0); 572 assert(cmdopt.keyFields == [0, 1, 2]); 573 assert(cmdopt.numFiles == 5); 574 
assert(cmdopt.hasHeader == false); 575 assert(cmdopt.keyIsFullLine == false); 576 } 577 { 578 auto args = ["unittest", "-n", "5", "-k", "0", somefile_txt]; 579 TsvSplitOptions cmdopt; 580 const r = cmdopt.processArgs(args); 581 582 assert(cmdopt.linesPerFile == 0); 583 assert(cmdopt.numFiles == 5); 584 assert(cmdopt.hasHeader == false); 585 assert(cmdopt.keyIsFullLine == true); 586 } 587 { 588 auto args = ["unittest", "-n", "2", "--header", somefile_txt]; 589 TsvSplitOptions cmdopt; 590 const r = cmdopt.processArgs(args); 591 592 assert(cmdopt.headerInOut == true); 593 assert(cmdopt.hasHeader == true); 594 assert(cmdopt.headerIn == false); 595 } 596 { 597 auto args = ["unittest", "-n", "2", "--header-in-only", somefile_txt]; 598 TsvSplitOptions cmdopt; 599 const r = cmdopt.processArgs(args); 600 601 assert(cmdopt.headerInOut == false); 602 assert(cmdopt.hasHeader == true); 603 assert(cmdopt.headerIn == true); 604 } 605 606 static void testSuffix(string[] args, string expectedSuffix) 607 { 608 TsvSplitOptions cmdopt; 609 auto savedArgs = args.to!string; 610 const r = cmdopt.processArgs(args); 611 612 assert(r[0], format("[testSuffix] cmdopt.processArgs(%s) returned false.", savedArgs)); 613 assert(cmdopt.suffix == expectedSuffix, 614 format("[testSuffix] Incorrect cmdopt.suffix. Expected: '%s', Actual: '%s'\n cmdopt.processArgs(%s)", 615 expectedSuffix, cmdopt.suffix, savedArgs)); 616 } 617 618 /* In these tests, don't use headers and when files are listed, use 'somefile_txt' first. 619 * This makes sure there is no attempt to read standard input and that there won't be an 620 * open failure trying to find a file. 
621 */ 622 testSuffix(["unittest", "-n", "2"], ""); 623 testSuffix(["unittest", "-n", "2", "--", "-"], ""); 624 testSuffix(["unittest", "-n", "2", "--suffix", "_123"], "_123"); 625 testSuffix(["unittest", "-n", "2", somefile_txt], ".txt"); 626 testSuffix(["unittest", "-n", "2", somefile_txt, "anotherfile.pqr"], ".txt"); 627 testSuffix(["unittest", "-n", "2", "--suffix", ".X", somefile_txt, "anotherfile.pqr"], ".X"); 628 testSuffix(["unittest", "-n", "2", "--suffix", "", somefile_txt], ""); 629 testSuffix(["unittest", "-n", "2", "--", "-", somefile_txt], ""); 630 testSuffix(["unittest", "-n", "2", "--", somefile_txt, "-"], ".txt"); 631 632 static void testDigitWidth(string[] args, uint expected) 633 { 634 TsvSplitOptions cmdopt; 635 auto savedArgs = args.to!string; 636 const r = cmdopt.processArgs(args); 637 638 assert(r[0], format("[testDigitWidth] cmdopt.processArgs(%s) returned false.", savedArgs)); 639 assert(cmdopt.digitWidth == expected, 640 format("[testDigitWidth] Incorrect cmdopt.digitWidth. 
Expected: %d, Actual: %d\n cmdopt.processArgs(%s)", 641 expected, cmdopt.digitWidth, savedArgs)); 642 } 643 644 testDigitWidth(["unittest", "-n", "2", somefile_txt], 1); 645 testDigitWidth(["unittest", "-n", "2", "--digit-width" , "0", somefile_txt], 1); 646 testDigitWidth(["unittest", "-n", "10", somefile_txt], 1); 647 testDigitWidth(["unittest", "-n", "11", somefile_txt], 2); 648 testDigitWidth(["unittest", "-n", "555", somefile_txt], 3); 649 testDigitWidth(["unittest", "-n", "555", "--digit-width" , "2", somefile_txt], 2); 650 testDigitWidth(["unittest", "-n", "555", "--digit-width" , "4", somefile_txt], 4); 651 testDigitWidth(["unittest", "-l", "10", somefile_txt], 3); 652 testDigitWidth(["unittest", "-l", "10000", somefile_txt], 3); 653 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "0", somefile_txt], 3); 654 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "1", somefile_txt], 1); 655 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "5", somefile_txt], 5); 656 } 657 658 /** Get the rlimit current number of open files the process is allowed. 659 * 660 * This routine returns the current soft limit on the number of open files the process 661 * is allowed. This is the number returned by the command: '$ ulimit -n'. 662 * 663 * This routine translates this value to a 'uint', as tsv-split uses 'uint' for 664 * tracking output files. The rlimit 'rlim_t' type is usually 'ulong' or 'long'. 665 * RLIM_INFINITY and any value larger than 'uint.max' is translated to 'uint.max'. 666 * 667 * An exception is thrown if call to 'getrlimit' fails. 
668 */ 669 uint rlimitCurrOpenFilesLimit() 670 { 671 import core.sys.posix.sys.resource : 672 rlim_t, rlimit, getrlimit, RLIMIT_NOFILE, RLIM_INFINITY, RLIM_SAVED_CUR; 673 import std.conv : to; 674 675 uint currOpenFileLimit = uint.max; 676 677 rlimit rlimitMaxOpenFiles; 678 679 enforce(getrlimit(RLIMIT_NOFILE, &rlimitMaxOpenFiles) == 0, 680 "Internal error: getrlimit call failed"); 681 682 if (rlimitMaxOpenFiles.rlim_cur != RLIM_INFINITY && 683 rlimitMaxOpenFiles.rlim_cur != RLIM_SAVED_CUR && 684 rlimitMaxOpenFiles.rlim_cur >= 0 && 685 rlimitMaxOpenFiles.rlim_cur <= uint.max) 686 { 687 currOpenFileLimit = rlimitMaxOpenFiles.rlim_cur.to!uint; 688 } 689 690 return currOpenFileLimit; 691 } 692 693 /** Invokes the proper split routine based on the command line arguments. 694 * 695 * This routine is the top-level control after command line argument processing is 696 * done. It's primary job is to set up data structures and invoke the correct 697 * processing routine based on the command line arguments. 698 */ 699 void tsvSplit(ref TsvSplitOptions cmdopt) 700 { 701 /* Check that the input files were setup as expected. Should at least have one 702 * input, stdin if nothing else. */ 703 assert(!cmdopt.inputSources.empty); 704 705 if (cmdopt.linesPerFile != 0) 706 { 707 splitByLineCount(cmdopt); 708 } 709 else 710 { 711 /* Randomly distribute input lines to a specified number of files. */ 712 713 auto outputFiles = 714 SplitOutputFiles(cmdopt.numFiles, cmdopt.dir, cmdopt.prefix, cmdopt.suffix, 715 cmdopt.digitWidth, cmdopt.headerInOut, cmdopt.maxOpenOutputFiles, 716 cmdopt.inputSources.front.header); 717 718 if (!cmdopt.appendToExistingFiles) 719 { 720 string existingFile = outputFiles.checkIfFilesExist; 721 enforce(existingFile.length == 0, 722 format("One or more output files already exist. Use '--a|append' to append to existing files. 
File: '%s'.", 723 existingFile)); 724 } 725 726 if (cmdopt.keyFields.length == 0) 727 { 728 splitLinesRandomly(cmdopt, outputFiles); 729 } 730 else 731 { 732 splitLinesByKey(cmdopt, outputFiles); 733 } 734 } 735 } 736 737 /** A SplitOutputFiles struct holds a collection of output files. 738 * 739 * This struct manages a collection of output files used when writing to multiple 740 * files at once. This includes constructing filenames, opening and closing files, 741 * and writing data and header lines. 742 * 743 * Both random assignment (splitLinesRandomly) and random assignment by key 744 * (splitLinesByKey) use a SplitOutputFiles struct to manage output files. 745 * 746 * The main properties of the output file set are specified in the constuctor. The 747 * exception is the header line. This is not known until the first input file is 748 * read, so it is specified in a separate 'setHeader' call. 749 * 750 * Individual output files are written to based on their zero-based index in the 751 * output collection. The caller selects the output file number to write to and 752 * calls 'writeDataLine' to write a line. The header is written if needed. 
 */
struct SplitOutputFiles
{
    import std.conv : to;
    import std.file : exists;
    import std.path : buildPath;
    import std.stdio : File;

    /* Per-file bookkeeping: name, handle, and status flags. */
    static struct OutputFile
    {
        string filename;
        File ofile;
        bool hasData;   // True once the file is known to contain data (header or lines).
        bool isOpen;    // Track separately due to https://github.com/dlang/phobos/pull/7397
    }

    private uint _numFiles;
    private bool _writeHeaders;
    private uint _maxOpenFiles;

    private OutputFile[] _outputFiles;
    private uint _numOpenFiles = 0;
    private string _header;

    /* Constructor: assigns a filename to each output file slot. Files are not
     * opened here; they are opened lazily on first write (see writeDataLine).
     */
    this(uint numFiles, string dir, string filePrefix, string fileSuffix,
         uint fileDigitWidth, bool writeHeaders, uint maxOpenFiles, string header)
    {
        assert(numFiles >= 2);
        assert(maxOpenFiles >= 1);

        _numFiles = numFiles;
        _writeHeaders = writeHeaders;
        _maxOpenFiles = maxOpenFiles;
        _header = header;

        _outputFiles.length = numFiles;

        /* Filename assignment: '<dir>/<prefix>NNN<suffix>', NNN zero-padded to fileDigitWidth. */
        foreach (i, ref f; _outputFiles)
        {
            f.filename =
                buildPath(dir, format("%s%.*d%s", filePrefix, fileDigitWidth, i, fileSuffix));
        }
    }

    /* Destructor ensures all files are closed.
     *
     * Note: A dual check on whether the file is open is made. This is to avoid a
     * Phobos bug where std.File doesn't properly maintain the state of open files
     * if the File.open call fails. See: https://github.com/dlang/phobos/pull/7397.
     */
    ~this()
    {
        foreach (ref f; _outputFiles)
        {
            if (f.isOpen && f.ofile.isOpen)
            {
                assert(_numOpenFiles >= 1);

                f.ofile.close;
                f.isOpen = false;
                _numOpenFiles--;
            }
        }
    }

    /* Check if any of the files already exist.
     *
     * Returns the empty string if none of the files exist. Otherwise returns the
     * filename of the first existing file found. This is to facilitate error
     * message generation.
     */
    string checkIfFilesExist()
    {
        foreach (f; _outputFiles) if (f.filename.exists) return f.filename;
        return "";
    }

    /* Picks a random file to close. Used when the open file handle limit has been
     * reached. Scanning starts at a random index and wraps around, so the choice
     * is uniform over the currently open files' positions.
     */
    private void closeSomeFile()
    {
        import std.random : uniform;
        assert(_numOpenFiles > 0);

        immutable uint start = uniform(0, _numFiles);

        foreach (i; cycle(iota(_numFiles), start).take(_numFiles))
        {
            if (_outputFiles[i].isOpen)
            {
                _outputFiles[i].ofile.close;
                _outputFiles[i].isOpen = false;
                _numOpenFiles--;

                return;
            }
        }

        assert(false, "[SplitOutputFiles.closeSomeFile]: Could not find file to close.");
    }

    /* Write a line to the specified file number.
     *
     * A header is written to the file if headers are being written and this is the
     * first data written to the file. The file is opened (in append mode) if not
     * already open, closing some other file first if the open-handle limit has
     * been reached.
     */
    void writeDataLine(uint fileNum, const char[] data)
    {
        assert(fileNum < _numFiles);
        assert(fileNum < _outputFiles.length);
        assert(_numOpenFiles <= _maxOpenFiles);

        OutputFile* outputFile = &_outputFiles[fileNum];

        if (!outputFile.isOpen)
        {
            if (_numOpenFiles == _maxOpenFiles) closeSomeFile();
            assert(_numOpenFiles < _maxOpenFiles);

            outputFile.ofile = outputFile.filename.File("a");
            outputFile.isOpen = true;
            _numOpenFiles++;

            /* An appended-to file may already contain data (e.g. a prior run).
             * File.size of ulong.max indicates the size could not be determined
             * (non-seekable file); treat that as having no data.
             */
            if (!outputFile.hasData)
            {
                ulong filesize = outputFile.ofile.size;
                outputFile.hasData = (filesize > 0 && filesize != ulong.max);
            }
        }

        if (_writeHeaders && !outputFile.hasData) outputFile.ofile.writeln(_header);

        outputFile.ofile.writeln(data);
        outputFile.hasData = true;
    }
}

/** Write input lines to multiple files, randomly selecting an output file for each line.
893 */ 894 void splitLinesRandomly(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) 895 { 896 import std.random : Random = Mt19937, uniform; 897 import tsv_utils.common.utils : bufferedByLine, InputSourceRange; 898 899 /* inputSources must be an InputSourceRange and include at least stdin. */ 900 assert(!cmdopt.inputSources.empty); 901 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 902 903 auto randomGenerator = Random(cmdopt.seed); 904 905 /* Process each line. */ 906 foreach (inputStream; cmdopt.inputSources) 907 { 908 foreach (line; inputStream.file.bufferedByLine) 909 { 910 immutable uint outputFileNum = uniform(0, cmdopt.numFiles, randomGenerator); 911 outputFiles.writeDataLine(outputFileNum, line); 912 } 913 } 914 } 915 916 /** Write input lines to multiple output files using fields as a random selection key. 917 * 918 * Each input line is written to an output file. The output file is chosen using 919 * fields as a key. Each unique key is assigned to a file. All lines having the 920 * same key are written to the same file. 921 */ 922 void splitLinesByKey(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) 923 { 924 import std.algorithm : splitter; 925 import std.conv : to; 926 import std.digest.murmurhash; 927 import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, 928 InputSourceRange, throwIfWindowsNewlineOnUnix; 929 930 assert(cmdopt.keyFields.length > 0); 931 932 /* inputSources must be an InputSourceRange and include at least stdin. */ 933 assert(!cmdopt.inputSources.empty); 934 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 935 936 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 937 938 /* Create a mapping for the key fields. */ 939 auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); 940 941 /* Process each line. */ 942 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 
2 : 1; 943 foreach (inputStream; cmdopt.inputSources) 944 { 945 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 946 947 foreach (fileLineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine)) 948 { 949 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 950 951 /* Murmurhash works by successively adding individual keys, then finalizing. 952 * Adding individual keys is simpler if the full-line-as-key and individual 953 * fields as keys cases are separated. 954 */ 955 auto hasher = MurmurHash3!32(cmdopt.seed); 956 957 if (cmdopt.keyIsFullLine) 958 { 959 hasher.put(cast(ubyte[]) line); 960 } 961 else 962 { 963 assert(keyFieldsReordering !is null); 964 965 /* Gather the key field values and assemble the key. */ 966 keyFieldsReordering.initNewLine; 967 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 968 { 969 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 970 if (keyFieldsReordering.allFieldsFilled) break; 971 } 972 973 enforce(keyFieldsReordering.allFieldsFilled, 974 format("Not enough fields in line. File: %s, Line: %s", 975 inputStream.name, fileLineNum)); 976 977 foreach (count, key; keyFieldsReordering.outputFields.enumerate) 978 { 979 if (count > 0) hasher.put(delimArray); 980 hasher.put(cast(ubyte[]) key); 981 } 982 } 983 984 hasher.finish; 985 immutable uint outputFileNum = hasher.get % cmdopt.numFiles; 986 outputFiles.writeDataLine(outputFileNum, line); 987 } 988 } 989 } 990 991 /** Write input lines to multiple files, splitting based on line count. 992 * 993 * Note: readBufferSize is an argument primarily for unit test purposes. Normal uses 994 * should use the default value. 
 */
void splitByLineCount(ref TsvSplitOptions cmdopt, const size_t readBufferSize = 1024L * 128L)
{
    import std.file : exists;
    import std.path : buildPath;
    import std.stdio : File;
    import tsv_utils.common.utils : InputSourceRange;

    assert (readBufferSize > 0);
    ubyte[] readBuffer = new ubyte[readBufferSize];

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* Header line to replicate at the top of each output file (terminator included),
     * or the empty string when headers are not written to the output files. */
    string header = !cmdopt.headerInOut ? "" :
        cmdopt.inputSources.front.header(Yes.keepTerminator);
    size_t nextOutputFileNum = 0;
    File outputFile;
    string outputFileName;
    bool isOutputFileOpen = false;           // Open file status tracked separately due to phobos bugs
    size_t outputFileRemainingLines;

    /* nextNewlineIndex finds the index of the next newline character. It is an
     * alternative to std.algorithm.countUntil. Invoking 'find' directly results in
     * 'memchr' being used (faster). The current 'countUntil' implementation does
     * forward to find, but the way it is done avoids the memchr call optimization.
     */
    static long nextNewlineIndex(const ubyte[] buffer)
    {
        import std.algorithm : find;
        immutable ubyte newlineChar = '\n';
        immutable size_t buflen = buffer.length;
        immutable size_t findlen = buffer.find(newlineChar).length;

        return findlen > 0 ? buflen - findlen : -1;
    }

    foreach (inputStream; cmdopt.inputSources)
    {
        foreach (ref ubyte[] inputChunk; inputStream.file.byChunk(readBuffer))
        {
            /* Each read chunk may span several output files. Walk the chunk,
             * carving off runs of whole lines for the current output file. */
            size_t nextOutputChunkStart = 0;
            auto remainingInputChunk = inputChunk[nextOutputChunkStart .. $];

            while (!remainingInputChunk.empty)
            {
                /* See if the next output file needs to be opened. */
                if (!isOutputFileOpen)
                {
                    outputFileName =
                        buildPath(cmdopt.dir,
                                  format("%s%.*d%s", cmdopt.prefix,
                                         cmdopt.digitWidth, nextOutputFileNum, cmdopt.suffix));

                    enforce(cmdopt.appendToExistingFiles || !outputFileName.exists,
                            format("Output file already exists. Use '--a|append' to append to existing files. File: '%s'.",
                                   outputFileName));

                    outputFile = outputFileName.File("ab");
                    outputFile.setvbuf(1024L * 64L, _IOFBF);
                    isOutputFileOpen = true;
                    ++nextOutputFileNum;
                    outputFileRemainingLines = cmdopt.linesPerFile;

                    /* Write the header only if the file is empty. A size of
                     * ulong.max indicates the size could not be determined. */
                    if (cmdopt.headerInOut)
                    {
                        ulong filesize = outputFile.size;
                        if (filesize == 0 || filesize == ulong.max) outputFile.rawWrite(header);
                    }
                }

                /* Find more newlines for the current output file. */

                assert(outputFileRemainingLines > 0);

                size_t nextOutputChunkEnd = nextOutputChunkStart;

                while (outputFileRemainingLines != 0 && !remainingInputChunk.empty)
                {
                    /* Note: newlineIndex is relative to 'remainingInputChunk', not
                     * 'inputChunk'. Updates to variables referring to 'inputChunk'
                     * need to reflect this. In particular, 'nextOutputChunkEnd'.
                     */
                    immutable newlineIndex = nextNewlineIndex(remainingInputChunk);

                    if (newlineIndex == -1)
                    {
                        /* No newline in the rest of the chunk: take it all; the
                         * line continues in the next chunk. */
                        nextOutputChunkEnd = inputChunk.length;
                    }
                    else
                    {
                        --outputFileRemainingLines;
                        nextOutputChunkEnd += (newlineIndex + 1);
                    }

                    remainingInputChunk = inputChunk[nextOutputChunkEnd .. $];
                }

                assert(nextOutputChunkStart < nextOutputChunkEnd);
                assert(nextOutputChunkEnd <= inputChunk.length);

                outputFile.rawWrite(inputChunk[nextOutputChunkStart .. nextOutputChunkEnd]);

                if (outputFileRemainingLines == 0)
                {
                    outputFile.close;
                    isOutputFileOpen = false;
                }

                nextOutputChunkStart = nextOutputChunkEnd;

                assert(remainingInputChunk.length == inputChunk.length - nextOutputChunkStart);
            }
        }
    }
}

/* splitByLineCount unit tests.
 *
 * These tests are primarily for buffer management. There are edge cases involving the
 * interaction of buffer size, input file size, lines-per-file, and newline placement
 * that are difficult to test against the executable.
 */
unittest
{
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.algorithm : min;
    import std.array : appender;
    import std.conv : to;
    import std.file : exists, mkdir, rmdirRecurse;
    import std.path : buildPath;
    import std.process : escapeShellCommand, executeShell;

    /* Test setup
     *
     * A set of twenty input files is created, with names: input_NxM.txt, where
     * N is the number of characters in each row and M is the number of rows (lines).
     * The resulting files are put in the "lc_input" directory ('inputDir' variable)
     * and have names:
     *     input_0x2.txt, input_0x3.txt, ... input_5x5.txt.
     *
     * A standalone block of code produces the expected result files for splitting an
     * input file into a set of output files. This duplicates the splitByLineCount
     * output. This is done for lines-per-file counts 1 to 5. Each result set is placed
     * in a subdirectory under "lc_expected" ('expectedDir' variable). Subdirectories
     * have names like: "0x2_by_1", "0x3_by_1", ..., "5x5_by_4".
     *
     * splitByLine is called for all the same input files and lines-per-file settings used
     * to produce the expected output. This is done via testSplitByLineCount, which calls
     * command line argument processing and splitByLine, similar to how the main program
     * works. The results are written to a subdirectory. The subdirectory is compared to
     * the expected output directory using the system 'diff' command.
     *
     * splitByLine is called multiple times for each expected output case. The different
     * calls iterate over a series of small ReadBufferSizes. This is how tests for edge
     * cases in the readBufferSize vs line lengths, newline placement, etc., are
     * accomplished.
     *
     * Note: One way to understand what is going on is to comment out the line:
     *
     *     scope(exit) testDir.rmdirRecurse;
     *
     * Then run the test (e.g. 'make test') and look at the directory structure left
     * behind. Print out the 'testDir' directory to see where it is located.
     */

    /* testSplitByLineCount acts as a surrogate for main() and tsvSplit(). It makes the
     * call to splitByLineCount and calls 'diff' to compare the output directory to the
     * expected directory. An assert is thrown if the directories do not match.
     */
    static void testSplitByLineCount(string[] cmdArgs, string expectedDir,
                                     size_t readBufferSize = 1024L * 512L)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSplitByLineCount] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSplitByLineCount] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSplitOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
        assert(cmdopt.linesPerFile != 0, "[testSplitByLineCount] --lines-per-file is required.");
        assert(!cmdopt.dir.empty, "[testSplitByLineCount] --dir is required.");

        splitByLineCount(cmdopt, readBufferSize);

        /* Diff command setup. */
        auto diffCmdArgs = ["diff", expectedDir, cmdopt.dir];
        auto diffResult = executeShell(escapeShellCommand(diffCmdArgs));
        assert(diffResult.status == 0,
               format("[testSplitByLineCount]\n cmd: %s\n readBufferSize: %d\n expectedDir: %s\n------ Diff ------%s\n-------",
                      savedCmdArgs, readBufferSize, expectedDir, diffResult.output));
    }

    auto testDir = makeUnittestTempDir("tsv_split_bylinecount");
    scope(exit) testDir.rmdirRecurse;

    auto inputDir = buildPath(testDir, "lc_input");
    auto outputDir = buildPath(testDir, "lc_output");
    auto expectedDir = buildPath(testDir, "lc_expected");

    mkdir(inputDir);
    mkdir(outputDir);
    mkdir(expectedDir);

    static string buildInputFilePath(string dir, long inputLineLength, long inputFileNumLines)
    {
        return buildPath(dir, format("input_%dx%d.txt", inputLineLength, inputFileNumLines));
    }

    string[5] outputRowData =
        [
            "abcde",
            "fghij",
            "klmno",
            "pqrst",
            "uvwxy"
        ];

    /* The main test loop. Iterates over input line lengths, numbers of rows,
     * lines-per-file, and finally readBufferSize lengths. All combos are tested.
     */
    foreach (inputLineLength; 0 .. 6)
    {
        foreach (inputFileNumLines; 2 .. 6)
        {
            auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);

            /* Generate the input file for this line-length/row-count combo. */
            {
                auto ofile = inputFile.File("w");
                auto output = appender!(char[])();
                foreach (m; 0 .. inputFileNumLines)
                {
                    put(output, outputRowData[m][0 .. inputLineLength]);
                    put(output, '\n');
                }
                ofile.write(output.data);
                ofile.close;
            }

            /* Iterate over the different lines-per-file lengths.
             * - Create an expected output directory and files for each.
             * - Test with different readBufferSize values.
             */
            foreach (outputFileNumLines; 1 .. min(5, inputFileNumLines))
            {
                auto expectedSubDir =
                    buildPath(expectedDir, format("%dx%d_by_%d", inputLineLength,
                                                  inputFileNumLines, outputFileNumLines));
                mkdir(expectedSubDir);

                size_t filenum = 0;
                size_t linesWritten = 0;
                while (linesWritten < inputFileNumLines)
                {
                    auto expectedFile = buildPath(expectedSubDir, format("part_%d.txt", filenum));
                    auto f = expectedFile.File("w");
                    auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);
                    foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
                    {
                        f.writeln(line[0 .. inputLineLength]);
                    }
                    linesWritten += linesToWrite;
                    ++filenum;
                    f.close;
                }

                /* Test the different readBufferSizes.
                 * - An output directory is created for the run and deleted afterward.
                 * - First test the default size.
                 * - Then iterate over small readBufferSize values.
                 */
                auto outputSubDir =
                    buildPath(outputDir, format("%dx%d_by_%d", inputLineLength,
                                                inputFileNumLines, outputFileNumLines));
                mkdir(outputSubDir);

                testSplitByLineCount(
                    ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                     "--digit-width", "1", inputFile],
                    expectedSubDir);

                outputSubDir.rmdirRecurse;

                foreach (readBufSize; 1 .. 8)
                {
                    mkdir(outputSubDir);

                    testSplitByLineCount(
                        ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                         "--digit-width", "1", inputFile],
                        expectedSubDir, readBufSize);

                    outputSubDir.rmdirRecurse;
                }
            }
        }
    }

    {
        /* Tests for the special case where readBufferSize is smaller than the header
         * line. We'll reuse the input_5x4.txt input file and write 1 line-per-file.
         */
        immutable inputLineLength = 5;
        immutable inputFileNumLines = 4;
        immutable outputFileNumLines = 1;

        auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);
        assert(inputFile.exists);

        auto expectedSubDirHeader =
            buildPath(expectedDir, format("%dx%d_by_%d_header", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        auto expectedSubDirHeaderInOnly =
            buildPath(expectedDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        mkdir(expectedSubDirHeader);
        mkdir(expectedSubDirHeaderInOnly);

        /* Generate the expected results. Cheat by starting with linesWritten = 1. This
         * automatically excludes the header line, but keeps the loop code consistent
         * with the main test loop.
         */
        size_t filenum = 0;
        size_t linesWritten = 1;
        while (linesWritten < inputFileNumLines)
        {
            auto expectedFileHeader = buildPath(expectedSubDirHeader, format("part_%d.txt", filenum));
            auto expectedFileHeaderInOnly = buildPath(expectedSubDirHeaderInOnly,
                                                      format("part_%d.txt", filenum));
            auto fHeader = expectedFileHeader.File("w");
            auto fHeaderInOnly = expectedFileHeaderInOnly.File("w");
            auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);

            fHeader.writeln(outputRowData[0][0 .. inputLineLength]);
            foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
            {
                fHeader.writeln(line[0 .. inputLineLength]);
                fHeaderInOnly.writeln(line[0 .. inputLineLength]);
            }
            linesWritten += linesToWrite;
            ++filenum;
            fHeader.close;
            fHeaderInOnly.close;
        }

        /* Now run the tests. */
        auto outputSubDirHeader =
            buildPath(outputDir, format("%dx%d_by_%d_header", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));
        auto outputSubDirHeaderInOnly =
            buildPath(outputDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));

        foreach (readBufSize; 1 .. 6)
        {
            mkdir(outputSubDirHeader);
            mkdir(outputSubDirHeaderInOnly);

            testSplitByLineCount(
                ["test", "--header", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeader, "--digit-width", "1", inputFile],
                expectedSubDirHeader, readBufSize);

            testSplitByLineCount(
                ["test", "--header-in-only", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeaderInOnly, "--digit-width", "1", inputFile],
                expectedSubDirHeaderInOnly, readBufSize);

            outputSubDirHeader.rmdirRecurse;
            outputSubDirHeaderInOnly.rmdirRecurse;
        }
    }
}