/**
Command line tool for splitting a file (or files) into multiple output files.
Several methods for splitting are available, including splitting by line count,
splitting by random assignment, and splitting by random assignment based on
key fields.

Copyright (c) 2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_split;

import std.exception : enforce;
import std.format : format;
import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program.
     *
     * Invokes command line argument processing and calls tsvSplit to do the real
     * work. Errors occurring during processing are caught and reported to the user.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSplitOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try
        {
            tsvSplit(cmdopt);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

immutable helpText = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

By default, files are written to the current directory and have names
of the form 'part_NNN<suffix>', with 'NNN' being a number and <suffix>
being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The output
directory and file names are customizable.

Use '--help-verbose' for more detailed information.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-split [options] [file...]

Split input lines into multiple output files. There are three modes of
operation:

* Fixed number of lines per file (--l|lines-per-file NUM): Each input
  block of NUM lines is written to a new file. Similar to Unix 'split'.

* Random assignment (--n|num-files NUM): Each input line is written to a
  randomly selected output file. Random selection is from NUM files.

* Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
  Input lines are written to output files using fields as a key. Each
  unique key is randomly assigned to one of NUM output files. All lines
  with the same key are written to the same file.

Output files: By default, files are written to the current directory and
have names of the form 'part_NNN<suffix>', with 'NNN' being a number and
<suffix> being the extension of the first input file. If the input file is
'file.txt', the names will take the form 'part_NNN.txt'. The suffix is
empty when reading from standard input. The numeric part defaults to 3
digits for '--l|lines-per-files'. For '--n|num-files' enough digits are
used so all filenames are the same length. The output directory and file
names are customizable.

Header lines: There are two ways to handle input with headers: write a
header to all output files (--H|header), or exclude headers from all
output files ('--I|header-in-only'). The best choice depends on the
follow-up processing. All tsv-utils tools support header lines in multiple
input files, but many other tools do not. For example, GNU parallel works
best on files without header lines.

Random assignment (--n|num-files): Random distribution of records to a set
of files is a common task. When data fits in memory the preferred approach
is usually to shuffle the data and split it into fixed sized blocks. E.g.
'tsv-sample data.tsv | tsv-split -l NUM'. However, alternate approaches
are needed when data is too large for convenient shuffling. tsv-split's
random assignment feature is useful in this case. Each input line is
written to a randomly selected output file. Note that output files will
have similar but not identical numbers of records.

Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): This
splits a data set into multiple files sharded by key. All lines with the
same key are written to the same file. This partitioning enables parallel
computation based on the key. For example, statistical calculation
('tsv-summarize --group-by') or duplicate removal ('tsv-uniq --fields').
These operations can be parallelized using tools like GNU parallel, which
simplifies concurrent operations on multiple files.

Random seed: By default, each tsv-split invocation using random assignment
or random assignment by key produces different assignments to the output
files. Using '--s|static-seed' changes this so multiple runs produce the
same assignments. This works by using the same random seed each run. The
seed can be specified using '--v|seed-value'.

Appending to existing files: By default, an error is triggered if an
output file already exists. '--a|append' changes this so that lines are
appended to existing files. (Header lines are not appended to files with
data.) This is useful when adding new data to files created by a previous
tsv-split run. Random assignment should use the same '--n|num-files' value
each run, but different random seeds (avoid '--s|static-seed'). Random
assignment by key should use the same '--n|num-files', '--k|key-fields',
and seed ('--s|static-seed' or '--v|seed-value') each run.

Max number of open files: Random assignment and random assignment by key
are dramatically faster when all output files are kept open. However,
keeping a large numbers of open files can bump into system limits or limit
resources available to other processes. By default, tsv-split uses up to
4096 open files or the system per-process limit, whichever is smaller.
This can be changed using '--max-open-files', though it cannot be set
larger than the system limit. The system limit varies considerably between
systems. On many systems it is unlimited. On MacOS it is often set to 256.
Use Unix 'ulimit' to display and modify the limits:
* 'ulimit -n' - Show the "soft limit". The per-process maximum.
* 'ulimit -Hn' - Show the "hard limit". The max allowed soft limit.
* 'ulimit -Sn NUM' - Change the "soft limit" to NUM.

Examples:

# Split a 10 million line file into 1000 files, 10,000 lines each.
# Output files are part_000.txt, part_001.txt, ... part_999.txt.
tsv-split data.txt --lines-per-file 10000

# Same as the previous example, but write files to a subdirectory.
tsv-split data.txt --dir split_files --lines-per-file 10000

# Split a file into 10,000 line files, writing a header line to each
tsv-split data.txt -H --lines-per-file 10000

# Same as the previous example, but dropping the header line.
tsv-split data.txt -I --lines-per-file 10000

# Randomly assign lines to 1000 files
tsv-split data.txt --num-files 1000

# Randomly assign lines to 1000 files while keeping unique keys from
# field 3 together.
tsv-split data.tsv --num-files 1000 -k 3

# Randomly assign lines to 1000 files. Later, randomly assign lines
# from a second data file to the same output files.
tsv-split data1.tsv -n 1000
tsv-split data2.tsv -n 1000 --append

# Randomly assign lines to 1000 files using field 3 as a key.
# Later, add a second file to the same output files.
tsv-split data1.tsv -n 1000 -k 3 --static-seed
tsv-split data2.tsv -n 1000 -k 3 --static-seed --append

# Change the system per-process open file limit for one command.
# The parens create a sub-shell. The current shell is not changed.
( ulimit -Sn 1000 && tsv-split --num-files 1000 data.txt )

Options:
EOS";

/** Container for command line options and derived data.
 *
 * TsvSplitOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, TsvSplitOptions is used as a container
 * holding the specific processing options used by the splitting algorithms.
217 */ 218 struct TsvSplitOptions 219 { 220 import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; 221 222 enum invalidFileSuffix = "///////"; 223 224 string programName; /// Program name 225 InputSourceRange inputSources; /// Input files 226 bool helpVerbose = false; /// --help-verbose 227 bool headerInOut = false; /// --H|header 228 bool headerIn = false; /// --I|header-in-only 229 size_t linesPerFile = 0; /// --l|lines-per-file 230 uint numFiles = 0; /// --n|num-files 231 size_t[] keyFields; /// --k|key-fields 232 string dir; /// --dir 233 string prefix = "part_"; /// --prefix 234 string suffix = invalidFileSuffix; /// --suffix 235 uint digitWidth = 0; /// --w|digit-width 236 bool appendToExistingFiles = false; /// --a|append 237 bool staticSeed = false; /// --s|static-seed 238 uint seedValueOptionArg = 0; /// --v|seed-value 239 char delim = '\t'; /// --d|delimiter 240 uint maxOpenFilesArg = 0; /// --max-open-files 241 bool versionWanted = false; /// --V|version 242 bool hasHeader = false; /// Derived. True if either '--H|header' or '--I|header-in-only' is set. 243 bool keyIsFullLine = false; /// Derived. True if '--f|fields 0' is specfied. 244 bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value 245 uint seed = 0; /// Derived from --static-seed, --seed-value 246 uint maxOpenOutputFiles; /// Derived. 247 248 /** Process tsv-split command line arguments. 249 * 250 * Defines the command line options, performs validation, and derives additional 251 * state. std.getopt.getopt is called to do the main option processing followed 252 * additional validation and derivation. 253 * 254 * Help text is printed to standard output if help was requested. Error text is 255 * written to stderr if invalid input is encountered. 256 * 257 * A tuple is returned. 
First value is true if command line arguments were 258 * successfully processed and execution should continue, or false if an error 259 * occurred or the user asked for help. If false, the second value is the 260 * appropriate exit code (0 or 1). 261 * 262 * Returning true (execution continues) means args have been validated and derived 263 * values calculated. Field indices will have been converted to zero-based. 264 */ 265 auto processArgs(ref string[] cmdArgs) 266 { 267 import std.algorithm : all, canFind, each, min; 268 import std.file : exists, isDir; 269 import std.getopt; 270 import std.math : isNaN; 271 import std.path : baseName, expandTilde, extension, stripExtension; 272 import std.typecons : Yes, No; 273 import tsv_utils.common.utils : makeFieldListOptionHandler; 274 275 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 276 277 try 278 { 279 arraySep = ","; // Use comma to separate values in command line options 280 auto r = getopt( 281 cmdArgs, 282 "help-verbose", " Print more detailed help.", &helpVerbose, 283 284 std.getopt.config.caseSensitive, 285 "H|header", " Input files have a header line. Write the header to each output file.", &headerInOut, 286 "I|header-in-only", " Input files have a header line. Do not write the header to output files.", &headerIn, 287 std.getopt.config.caseInsensitive, 288 289 "l|lines-per-file", "NUM Number of lines to write to each output file (excluding the header line).", &linesPerFile, 290 "n|num-files", "NUM Number of output files to generate.", &numFiles, 291 "k|key-fields", "<field-list> Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.", 292 keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero), 293 294 "dir", "STR Directory to write to. Default: Current working directory.", &dir, 295 "prefix", "STR Filename prefix. 
Default: 'part_'", &prefix, 296 "suffix", "STR Filename suffix. Default: First input file extension. None for standard input.", &suffix, 297 "w|digit-width", "NUM Number of digits in filename numeric portion. Default: '--l|lines-per-file': 3. '--n|num-files': Chosen so filenames have the same length. '--w|digit-width 0' uses the default.", &digitWidth, 298 "a|append", " Append to existing files.", &appendToExistingFiles, 299 300 "s|static-seed", " Use the same random seed every run.", &staticSeed, 301 302 std.getopt.config.caseSensitive, 303 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, 304 std.getopt.config.caseInsensitive, 305 306 "d|delimiter", "CHR Field delimiter.", &delim, 307 "max-open-files", "NUM Maximum open file handles to use. Min of 5 required.", &maxOpenFilesArg, 308 309 std.getopt.config.caseSensitive, 310 "V|version", " Print version information and exit.", &versionWanted, 311 std.getopt.config.caseInsensitive, 312 ); 313 314 if (r.helpWanted) 315 { 316 defaultGetoptPrinter(helpText, r.options); 317 return tuple(false, 0); 318 } 319 else if (helpVerbose) 320 { 321 defaultGetoptPrinter(helpTextVerbose, r.options); 322 return tuple(false, 0); 323 } 324 else if (versionWanted) 325 { 326 import tsv_utils.common.tsvutils_version; 327 writeln(tsvutilsVersionNotice("tsv-split")); 328 return tuple(false, 0); 329 } 330 331 /* 332 * Validation and derivations. 
333 */ 334 335 enforce(linesPerFile != 0 || numFiles != 0, 336 "Either '--l|lines-per-file' or '--n|num-files' is required."); 337 338 enforce(linesPerFile == 0 || numFiles == 0, 339 "'--l|lines-per-file' and '--n|num-files' cannot be used together."); 340 341 enforce(linesPerFile == 0 || keyFields.length == 0, 342 "'--l|lines-per-file' and '--k|key-fields' cannot be used together."); 343 344 enforce(numFiles != 1, "'--n|num-files must be two or more."); 345 346 if (keyFields.length > 0) 347 { 348 if (keyFields.length == 1 && keyFields[0] == 0) 349 { 350 keyIsFullLine = true; 351 } 352 else 353 { 354 enforce(keyFields.all!(x => x != 0), 355 "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); 356 357 keyFields.each!((ref x) => --x); // Convert to zero-based indexing. 358 } 359 } 360 361 enforce(!(headerInOut && headerIn), 362 "Use only one of '--H|header' and '--I|header-in-only'."); 363 364 hasHeader = headerInOut || headerIn; 365 366 if (!dir.empty) 367 { 368 dir = dir.expandTilde; 369 enforce(dir.exists, format("Directory does not exist: --dir '%s'", dir)); 370 enforce(dir.isDir, format("Path is not a directory: --dir '%s'", dir)); 371 } 372 373 /* Seed. */ 374 import std.random : unpredictableSeed; 375 376 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 377 378 if (usingUnpredictableSeed) seed = unpredictableSeed; 379 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 380 else if (staticSeed) seed = 2438424139; 381 else assert(0, "Internal error, invalid seed option states."); 382 383 /* Maximum number of open files. Mainly applies when --num-files is used. 384 * 385 * Derive maxOpenOutputFiles. Inputs: 386 * - Internal default limit: 4096. This is a somewhat conservative setting. 387 * - rlimit open files limit. Defined by '$ ulimit -n'. 388 * - '--max-open-files' (maxOpenFilesArg). This adjusts the internal limit, 389 * but only up to the rlimit value. 
390 * - Four open files are reserved for stdin, stdout, stderr, and one input 391 * file. 392 */ 393 394 immutable uint internalDefaultMaxOpenFiles = 4096; 395 immutable uint numReservedOpenFiles = 4; 396 immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit(); 397 398 399 enforce(maxOpenFilesArg == 0 || maxOpenFilesArg > numReservedOpenFiles, 400 format("'--max-open-files' must be at least %d.", 401 numReservedOpenFiles + 1)); 402 403 enforce(maxOpenFilesArg <= rlimitOpenFilesLimit, 404 format("'--max-open-files' value (%d) greater current system limit (%d)." ~ 405 "\nRun 'ulimit -n' to see the soft limit." ~ 406 "\nRun 'ulimit -Hn' to see the hard limit." ~ 407 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 408 maxOpenFilesArg, rlimitOpenFilesLimit)); 409 410 enforce(rlimitOpenFilesLimit > numReservedOpenFiles, 411 format("System open file limit too small. Current value: %d. Must be %d or more." ~ 412 "\nRun 'ulimit -n' to see the soft limit." ~ 413 "\nRun 'ulimit -Hn' to see the hard limit." ~ 414 "\nRun 'ulimit -Sn NUM' to change the soft limit.", 415 rlimitOpenFilesLimit, numReservedOpenFiles + 1)); 416 417 immutable uint openFilesLimit = 418 (maxOpenFilesArg != 0) 419 ? maxOpenFilesArg 420 : min(internalDefaultMaxOpenFiles, rlimitOpenFilesLimit); 421 422 assert(openFilesLimit > numReservedOpenFiles); 423 424 maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles; 425 426 /* Remaining command line args are files. 427 */ 428 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 429 cmdArgs.length = 1; 430 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 431 inputSources = inputSourceRange(filepaths, readHeader); 432 433 /* Suffix - If not provided, use the extension of the first input file. 434 * No suffix if reading from standard input. 435 */ 436 if (suffix == invalidFileSuffix) suffix = filepaths[0].extension; 437 438 /* Ensure forward slash is not included in the filename prefix and suffix. 
439 * Forward slash is an invalid Unix filename character. However, open file 440 * calls could match a directory path, resulting in unintended file 441 * creation. 442 * 443 * The other invalid filename character on Unix is the NULL character. 444 * However, the NULL character cannot be entered via Unix command lines, 445 * so there is no need to test for it explicitly. 446 */ 447 enforce(!prefix.canFind('/'), 448 "'--prefix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 449 450 enforce(!suffix.canFind('/'), 451 "'--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory."); 452 453 /* Digit width - If not specified, or specified as zero, the width is 454 * determined by the number of files for --num-files, or defaulted to 3 455 * for --lines-per-file. 456 */ 457 if (digitWidth == 0) 458 { 459 if (numFiles > 0) 460 { 461 digitWidth = 1; 462 uint n = numFiles - 1; 463 while (n >= 10) 464 { 465 n /= 10; 466 ++digitWidth; 467 } 468 } 469 else 470 { 471 digitWidth = 3; 472 } 473 } 474 assert(digitWidth != 0); 475 } 476 catch (Exception exc) 477 { 478 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 479 return tuple(false, 1); 480 } 481 return tuple(true, 0); 482 } 483 } 484 485 /* TsvSplitOptions unit tests (command-line argument processing). 486 * 487 * Basic tests. Many cases are covered in executable tests, including all error cases, 488 * as errors write to stderr. 489 */ 490 unittest 491 { 492 import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. 493 import std.conv : to; 494 import std.file : mkdir, rmdirRecurse; 495 import std.path : buildPath; 496 497 /* A dummy file is used so we don't have to worry about the cases where command 498 * line processing might open a file. Don't want to use stanard input for this, 499 * at least in cases where it might try to read to get the header line. 
500 */ 501 auto testDir = makeUnittestTempDir("tsv_split_bylinecount"); 502 scope(exit) testDir.rmdirRecurse; 503 504 string somefile_txt = buildPath(testDir, "somefile.txt"); 505 somefile_txt.File("w").writeln("Hello World!"); 506 507 { 508 auto args = ["unittest", "--lines-per-file", "10", somefile_txt]; 509 TsvSplitOptions cmdopt; 510 const r = cmdopt.processArgs(args); 511 512 assert(cmdopt.linesPerFile == 10); 513 assert(cmdopt.keyFields.empty); 514 assert(cmdopt.numFiles == 0); 515 assert(cmdopt.hasHeader == false); 516 } 517 { 518 auto args = ["unittest", "--num-files", "20", somefile_txt]; 519 TsvSplitOptions cmdopt; 520 const r = cmdopt.processArgs(args); 521 522 assert(cmdopt.linesPerFile == 0); 523 assert(cmdopt.keyFields.empty); 524 assert(cmdopt.numFiles == 20); 525 assert(cmdopt.hasHeader == false); 526 } 527 { 528 auto args = ["unittest", "-n", "5", "--key-fields", "1-3", somefile_txt]; 529 TsvSplitOptions cmdopt; 530 const r = cmdopt.processArgs(args); 531 532 assert(cmdopt.linesPerFile == 0); 533 assert(cmdopt.keyFields == [0, 1, 2]); 534 assert(cmdopt.numFiles == 5); 535 assert(cmdopt.hasHeader == false); 536 assert(cmdopt.keyIsFullLine == false); 537 } 538 { 539 auto args = ["unittest", "-n", "5", "-k", "0", somefile_txt]; 540 TsvSplitOptions cmdopt; 541 const r = cmdopt.processArgs(args); 542 543 assert(cmdopt.linesPerFile == 0); 544 assert(cmdopt.numFiles == 5); 545 assert(cmdopt.hasHeader == false); 546 assert(cmdopt.keyIsFullLine == true); 547 } 548 { 549 auto args = ["unittest", "-n", "2", "--header", somefile_txt]; 550 TsvSplitOptions cmdopt; 551 const r = cmdopt.processArgs(args); 552 553 assert(cmdopt.headerInOut == true); 554 assert(cmdopt.hasHeader == true); 555 assert(cmdopt.headerIn == false); 556 } 557 { 558 auto args = ["unittest", "-n", "2", "--header-in-only", somefile_txt]; 559 TsvSplitOptions cmdopt; 560 const r = cmdopt.processArgs(args); 561 562 assert(cmdopt.headerInOut == false); 563 assert(cmdopt.hasHeader == true); 564 
assert(cmdopt.headerIn == true); 565 } 566 567 static void testSuffix(string[] args, string expectedSuffix) 568 { 569 TsvSplitOptions cmdopt; 570 auto savedArgs = args.to!string; 571 const r = cmdopt.processArgs(args); 572 573 assert(r[0], format("[testSuffix] cmdopt.processArgs(%s) returned false.", savedArgs)); 574 assert(cmdopt.suffix == expectedSuffix, 575 format("[testSuffix] Incorrect cmdopt.suffix. Expected: '%s', Actual: '%s'\n cmdopt.processArgs(%s)", 576 expectedSuffix, cmdopt.suffix, savedArgs)); 577 } 578 579 /* In these tests, don't use headers and when files are listed, use 'somefile_txt' first. 580 * This make sure there is no attempt to read standard input and that there won't be an 581 * open failure trying to find a file. 582 */ 583 testSuffix(["unittest", "-n", "2"], ""); 584 testSuffix(["unittest", "-n", "2", "--", "-"], ""); 585 testSuffix(["unittest", "-n", "2", "--suffix", "_123"], "_123"); 586 testSuffix(["unittest", "-n", "2", somefile_txt], ".txt"); 587 testSuffix(["unittest", "-n", "2", somefile_txt, "anotherfile.pqr"], ".txt"); 588 testSuffix(["unittest", "-n", "2", "--suffix", ".X", somefile_txt, "anotherfile.pqr"], ".X"); 589 testSuffix(["unittest", "-n", "2", "--suffix", "", somefile_txt], ""); 590 testSuffix(["unittest", "-n", "2", "--", "-", somefile_txt], ""); 591 testSuffix(["unittest", "-n", "2", "--", somefile_txt, "-"], ".txt"); 592 593 static void testDigitWidth(string[] args, uint expected) 594 { 595 TsvSplitOptions cmdopt; 596 auto savedArgs = args.to!string; 597 const r = cmdopt.processArgs(args); 598 599 assert(r[0], format("[testDigitWidth] cmdopt.processArgs(%s) returned false.", savedArgs)); 600 assert(cmdopt.digitWidth == expected, 601 format("[testDigitWidth] Incorrect cmdopt.digitWidth. 
Expected: %d, Actual: %d\n cmdopt.processArgs(%s)", 602 expected, cmdopt.digitWidth, savedArgs)); 603 } 604 605 testDigitWidth(["unittest", "-n", "2", somefile_txt], 1); 606 testDigitWidth(["unittest", "-n", "2", "--digit-width" , "0", somefile_txt], 1); 607 testDigitWidth(["unittest", "-n", "10", somefile_txt], 1); 608 testDigitWidth(["unittest", "-n", "11", somefile_txt], 2); 609 testDigitWidth(["unittest", "-n", "555", somefile_txt], 3); 610 testDigitWidth(["unittest", "-n", "555", "--digit-width" , "2", somefile_txt], 2); 611 testDigitWidth(["unittest", "-n", "555", "--digit-width" , "4", somefile_txt], 4); 612 testDigitWidth(["unittest", "-l", "10", somefile_txt], 3); 613 testDigitWidth(["unittest", "-l", "10000", somefile_txt], 3); 614 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "0", somefile_txt], 3); 615 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "1", somefile_txt], 1); 616 testDigitWidth(["unittest", "-l", "10000", "--digit-width", "5", somefile_txt], 5); 617 } 618 619 /** Get the rlimit current number of open files the process is allowed. 620 * 621 * This routine returns the current soft limit on the number of open files the process 622 * is allowed. This is the number returned by the command: '$ ulimit -n'. 623 * 624 * This routine translates this value to a 'uint', as tsv-split uses 'uint' for 625 * tracking output files. The rlimit 'rlim_t' type is usually 'ulong' or 'long'. 626 * RLIM_INFINITY and any value larger than 'uint.max' is translated to 'uint.max'. 627 * 628 * An exception is thrown if call to 'getrlimit' fails. 
629 */ 630 uint rlimitCurrOpenFilesLimit() 631 { 632 import core.sys.posix.sys.resource : 633 rlim_t, rlimit, getrlimit, RLIMIT_NOFILE, RLIM_INFINITY, RLIM_SAVED_CUR; 634 import std.conv : to; 635 636 uint currOpenFileLimit = uint.max; 637 638 rlimit rlimitMaxOpenFiles; 639 640 enforce(getrlimit(RLIMIT_NOFILE, &rlimitMaxOpenFiles) == 0, 641 "Internal error: getrlimit call failed"); 642 643 if (rlimitMaxOpenFiles.rlim_cur != RLIM_INFINITY && 644 rlimitMaxOpenFiles.rlim_cur != RLIM_SAVED_CUR && 645 rlimitMaxOpenFiles.rlim_cur >= 0 && 646 rlimitMaxOpenFiles.rlim_cur <= uint.max) 647 { 648 currOpenFileLimit = rlimitMaxOpenFiles.rlim_cur.to!uint; 649 } 650 651 return currOpenFileLimit; 652 } 653 654 /** Invokes the proper split routine based on the command line arguments. 655 * 656 * This routine is the top-level control after command line argument processing is 657 * done. It's primary job is to set up data structures and invoke the correct 658 * processing routine based on the command line arguments. 659 */ 660 void tsvSplit(ref TsvSplitOptions cmdopt) 661 { 662 /* Check that the input files were setup as expected. Should at least have one 663 * input, stdin if nothing else. */ 664 assert(!cmdopt.inputSources.empty); 665 666 if (cmdopt.linesPerFile != 0) 667 { 668 splitByLineCount(cmdopt); 669 } 670 else 671 { 672 /* Randomly distribute input lines to a specified number of files. */ 673 674 auto outputFiles = 675 SplitOutputFiles(cmdopt.numFiles, cmdopt.dir, cmdopt.prefix, cmdopt.suffix, 676 cmdopt.digitWidth, cmdopt.headerInOut, cmdopt.maxOpenOutputFiles, 677 cmdopt.inputSources.front.header); 678 679 if (!cmdopt.appendToExistingFiles) 680 { 681 string existingFile = outputFiles.checkIfFilesExist; 682 enforce(existingFile.length == 0, 683 format("One or more output files already exist. Use '--a|append' to append to existing files. 
File: '%s'.", 684 existingFile)); 685 } 686 687 if (cmdopt.keyFields.length == 0) 688 { 689 splitLinesRandomly(cmdopt, outputFiles); 690 } 691 else 692 { 693 splitLinesByKey(cmdopt, outputFiles); 694 } 695 } 696 } 697 698 /** A SplitOutputFiles struct holds a collection of output files. 699 * 700 * This struct manages a collection of output files used when writing to multiple 701 * files at once. This includes constructing filenames, opening and closing files, 702 * and writing data and header lines. 703 * 704 * Both random assignment (splitLinesRandomly) and random assignment by key 705 * (splitLinesByKey) use a SplitOutputFiles struct to manage output files. 706 * 707 * The main properties of the output file set are specified in the constuctor. The 708 * exception is the header line. This is not known until the first input file is 709 * read, so it is specified in a separate 'setHeader' call. 710 * 711 * Individual output files are written to based on their zero-based index in the 712 * output collection. The caller selects the output file number to write to and 713 * calls 'writeDataLine' to write a line. The header is written if needed. 
714 */ 715 struct SplitOutputFiles 716 { 717 import std.conv : to; 718 import std.file : exists; 719 import std.path : buildPath; 720 import std.stdio : File; 721 722 static struct OutputFile 723 { 724 string filename; 725 File ofile; 726 bool hasData; 727 bool isOpen; // Track separately due to https://github.com/dlang/phobos/pull/7397 728 } 729 730 private uint _numFiles; 731 private bool _writeHeaders; 732 private uint _maxOpenFiles; 733 734 private OutputFile[] _outputFiles; 735 private uint _numOpenFiles = 0; 736 private string _header; 737 738 this(uint numFiles, string dir, string filePrefix, string fileSuffix, 739 uint fileDigitWidth, bool writeHeaders, uint maxOpenFiles, string header) 740 { 741 assert(numFiles >= 2); 742 assert(maxOpenFiles >= 1); 743 744 _numFiles = numFiles; 745 _writeHeaders = writeHeaders; 746 _maxOpenFiles = maxOpenFiles; 747 _header = header; 748 749 _outputFiles.length = numFiles; 750 751 /* Filename assignment. */ 752 foreach (i, ref f; _outputFiles) 753 { 754 f.filename = 755 buildPath(dir, format("%s%.*d%s", filePrefix, fileDigitWidth, i, fileSuffix)); 756 } 757 } 758 759 /* Destructor ensures all files are closed. 760 * 761 * Note: A dual check on whether the file is open is made. This is to avoid a 762 * Phobos bug where std.File doesn't properly maintain the state of open files 763 * if the File.open call fails. See: https://github.com/dlang/phobos/pull/7397. 764 */ 765 ~this() 766 { 767 foreach (ref f; _outputFiles) 768 { 769 if (f.isOpen && f.ofile.isOpen) 770 { 771 assert(_numOpenFiles >= 1); 772 773 f.ofile.close; 774 f.isOpen = false; 775 _numOpenFiles--; 776 } 777 } 778 } 779 780 /* Check if any of the files already exist. 781 * 782 * Returns the empty string if none of the files exist. Otherwise returns the 783 * filename of the first existing file found. This is to facilitate error 784 * message generation. 
785 */ 786 string checkIfFilesExist() 787 { 788 foreach (f; _outputFiles) if (f.filename.exists) return f.filename; 789 return ""; 790 } 791 792 /* Picks a random file to close. Used when the open file handle limit has been 793 * reached. 794 */ 795 private void closeSomeFile() 796 { 797 import std.random : uniform; 798 assert(_numOpenFiles > 0); 799 800 immutable uint start = uniform(0, _numFiles); 801 802 foreach (i; cycle(iota(_numFiles), start).take(_numFiles)) 803 { 804 if (_outputFiles[i].isOpen) 805 { 806 _outputFiles[i].ofile.close; 807 _outputFiles[i].isOpen = false; 808 _numOpenFiles--; 809 810 return; 811 } 812 } 813 814 assert(false, "[SplitOutputFiles.closeSomeFile]: Could not find file to close."); 815 } 816 817 /* Write a line to the specified file number. 818 * 819 * A header is written to the file if headers are being written and this is the 820 * first data written to the file. 821 */ 822 void writeDataLine(uint fileNum, const char[] data) 823 { 824 assert(fileNum < _numFiles); 825 assert(fileNum < _outputFiles.length); 826 assert(_numOpenFiles <= _maxOpenFiles); 827 828 OutputFile* outputFile = &_outputFiles[fileNum]; 829 830 if (!outputFile.isOpen) 831 { 832 if (_numOpenFiles == _maxOpenFiles) closeSomeFile(); 833 assert(_numOpenFiles < _maxOpenFiles); 834 835 outputFile.ofile = outputFile.filename.File("a"); 836 outputFile.isOpen = true; 837 _numOpenFiles++; 838 839 if (!outputFile.hasData) 840 { 841 ulong filesize = outputFile.ofile.size; 842 outputFile.hasData = (filesize > 0 && filesize != ulong.max); 843 } 844 } 845 846 if (_writeHeaders && !outputFile.hasData) outputFile.ofile.writeln(_header); 847 848 outputFile.ofile.writeln(data); 849 outputFile.hasData = true; 850 } 851 } 852 853 /** Write input lines to multiple files, randomly selecting an output file for each line. 
854 */ 855 void splitLinesRandomly(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) 856 { 857 import std.random : Random = Mt19937, uniform; 858 import tsv_utils.common.utils : bufferedByLine, InputSourceRange; 859 860 /* inputSources must be an InputSourceRange and include at least stdin. */ 861 assert(!cmdopt.inputSources.empty); 862 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 863 864 auto randomGenerator = Random(cmdopt.seed); 865 866 /* Process each line. */ 867 foreach (inputStream; cmdopt.inputSources) 868 { 869 foreach (line; inputStream.file.bufferedByLine) 870 { 871 immutable uint outputFileNum = uniform(0, cmdopt.numFiles, randomGenerator); 872 outputFiles.writeDataLine(outputFileNum, line); 873 } 874 } 875 } 876 877 /** Write input lines to multiple output files using fields as a random selection key. 878 * 879 * Each input line is written to an output file. The output file is chosen using 880 * fields as a key. Each unique key is assigned to a file. All lines having the 881 * same key are written to the same file. 882 */ 883 void splitLinesByKey(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles) 884 { 885 import std.algorithm : splitter; 886 import std.conv : to; 887 import std.digest.murmurhash; 888 import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, 889 InputSourceRange, throwIfWindowsNewlineOnUnix; 890 891 assert(cmdopt.keyFields.length > 0); 892 893 /* inputSources must be an InputSourceRange and include at least stdin. */ 894 assert(!cmdopt.inputSources.empty); 895 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 896 897 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 898 899 /* Create a mapping for the key fields. */ 900 auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); 901 902 /* Process each line. */ 903 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 
2 : 1; 904 foreach (inputStream; cmdopt.inputSources) 905 { 906 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 907 908 foreach (fileLineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine)) 909 { 910 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 911 912 /* Murmurhash works by successively adding individual keys, then finalizing. 913 * Adding individual keys is simpler if the full-line-as-key and individual 914 * fields as keys cases are separated. 915 */ 916 auto hasher = MurmurHash3!32(cmdopt.seed); 917 918 if (cmdopt.keyIsFullLine) 919 { 920 hasher.put(cast(ubyte[]) line); 921 } 922 else 923 { 924 assert(keyFieldsReordering !is null); 925 926 /* Gather the key field values and assemble the key. */ 927 keyFieldsReordering.initNewLine; 928 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 929 { 930 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 931 if (keyFieldsReordering.allFieldsFilled) break; 932 } 933 934 enforce(keyFieldsReordering.allFieldsFilled, 935 format("Not enough fields in line. File: %s, Line: %s", 936 inputStream.name, fileLineNum)); 937 938 foreach (count, key; keyFieldsReordering.outputFields.enumerate) 939 { 940 if (count > 0) hasher.put(delimArray); 941 hasher.put(cast(ubyte[]) key); 942 } 943 } 944 945 hasher.finish; 946 immutable uint outputFileNum = hasher.get % cmdopt.numFiles; 947 outputFiles.writeDataLine(outputFileNum, line); 948 } 949 } 950 } 951 952 /** Write input lines to multiple files, splitting based on line count. 953 * 954 * Note: readBufferSize is an argument primarily for unit test purposes. Normal uses 955 * should use the default value. 
 */
void splitByLineCount(ref TsvSplitOptions cmdopt, const size_t readBufferSize = 1024L * 128L)
{
    import std.file : exists;
    import std.path : buildPath;
    import std.stdio : File;
    import tsv_utils.common.utils : InputSourceRange;

    assert (readBufferSize > 0);
    ubyte[] readBuffer = new ubyte[readBufferSize];

    /* inputSources must be an InputSourceRange and include at least stdin. */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* Header line (terminator included) replicated at the start of each output file.
     * Empty unless header-in-and-out behavior was requested.
     */
    string header = !cmdopt.headerInOut ? "" :
        cmdopt.inputSources.front.header(Yes.keepTerminator);
    size_t nextOutputFileNum = 0;
    File outputFile;
    string outputFileName;
    bool isOutputFileOpen = false;    // Open file status tracked separately due to phobos bugs
    size_t outputFileRemainingLines;  // Lines still to be written to the current output file.

    /* nextNewlineIndex finds the index of the next newline character. It is an
     * alternative to std.algorithm.countUntil. Invoking 'find' directly results
     * 'memchr' being used (faster). The current 'countUntil' implementation does
     * forward to find, but the way it is done avoids the memchr call optimization.
     * Returns -1 when the buffer contains no newline.
     */
    static long nextNewlineIndex(const ubyte[] buffer)
    {
        import std.algorithm : find;
        immutable ubyte newlineChar = '\n';
        immutable size_t buflen = buffer.length;
        immutable size_t findlen = buffer.find(newlineChar).length;

        return findlen > 0 ? buflen - findlen : -1;
    }

    foreach (inputStream; cmdopt.inputSources)
    {
        /* Read the input in raw chunks, scanning for newlines. Output files are
         * opened lazily and closed once their line quota is reached.
         */
        foreach (ref ubyte[] inputChunk; inputStream.file.byChunk(readBuffer))
        {
            size_t nextOutputChunkStart = 0;
            auto remainingInputChunk = inputChunk[nextOutputChunkStart .. $];

            while (!remainingInputChunk.empty)
            {
                /* See if the next output file needs to be opened. */
                if (!isOutputFileOpen)
                {
                    outputFileName =
                        buildPath(cmdopt.dir,
                                  format("%s%.*d%s", cmdopt.prefix,
                                         cmdopt.digitWidth, nextOutputFileNum, cmdopt.suffix));

                    enforce(cmdopt.appendToExistingFiles || !outputFileName.exists,
                            format("Output file already exists. Use '--a|append' to append to existing files. File: '%s'.",
                                   outputFileName));

                    outputFile = outputFileName.File("ab");
                    outputFile.setvbuf(1024L * 64L, _IOFBF);
                    isOutputFileOpen = true;
                    ++nextOutputFileNum;
                    outputFileRemainingLines = cmdopt.linesPerFile;

                    if (cmdopt.headerInOut)
                    {
                        /* Write the header only if the file is empty. A size of
                         * ulong.max means the size could not be determined.
                         */
                        ulong filesize = outputFile.size;
                        if (filesize == 0 || filesize == ulong.max) outputFile.rawWrite(header);
                    }
                }

                /* Find more newlines for the current output file. */

                assert(outputFileRemainingLines > 0);

                size_t nextOutputChunkEnd = nextOutputChunkStart;

                while (outputFileRemainingLines != 0 && !remainingInputChunk.empty)
                {
                    /* Note: newlineIndex is relative to 'remainingInputChunk', not
                     * 'inputChunk'. Updates to variables referring to 'inputChunk'
                     * need to reflect this. In particular, 'nextOutputChunkEnd'.
                     */
                    immutable newlineIndex = nextNewlineIndex(remainingInputChunk);

                    if (newlineIndex == -1)
                    {
                        /* No newline in the rest of the chunk; the partial line still
                         * belongs to the current output file.
                         */
                        nextOutputChunkEnd = inputChunk.length;
                    }
                    else
                    {
                        --outputFileRemainingLines;
                        nextOutputChunkEnd += (newlineIndex + 1);
                    }

                    remainingInputChunk = inputChunk[nextOutputChunkEnd .. $];
                }

                assert(nextOutputChunkStart < nextOutputChunkEnd);
                assert(nextOutputChunkEnd <= inputChunk.length);

                outputFile.rawWrite(inputChunk[nextOutputChunkStart .. nextOutputChunkEnd]);

                if (outputFileRemainingLines == 0)
                {
                    outputFile.close;
                    isOutputFileOpen = false;
                }

                nextOutputChunkStart = nextOutputChunkEnd;

                assert(remainingInputChunk.length == inputChunk.length - nextOutputChunkStart);
            }
        }
    }
}

/* splitByLineCount unit tests.
 *
 * These tests are primarily for buffer management. There are edge cases involving the
 * interaction of buffer size, input file size, lines-per-file, and newline placement
 * that are difficult to test against the executable.
 */
unittest
{
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.algorithm : min;
    import std.array : appender;
    import std.conv : to;
    import std.file : exists, mkdir, rmdirRecurse;
    import std.path : buildPath;
    import std.process : escapeShellCommand, executeShell;

    /* Test setup
     *
     * A set of twenty input files is created, with names: input_NxM.txt, where
     * N is the number of characters in each row and M is the number of rows (lines).
     * The resulting files are put in the "lc_input" directory ('inputDir' variable)
     * and have names:
     *     input_0x2.txt, input_0x3.txt, ... input_5x5.txt.
     *
     * A standalone block of code produces the expected result files for splitting an
     * input file into a set of output files. This duplicates the splitByLineCount
     * output. This is done for lines-per-file counts 1 to 5. Each result set is placed
     * in a subdirectory under "lc_expected" ('expectedDir' variable). Subdirectories
     * have names like: "0x2_by_1", "0x3_by_1", ..., "5x5_by_4".
     *
     * splitByLine is called for all the same input files and lines-per-file settings used
     * to produce the expected output.
     * This is done via testSplitByLineCount, which calls
     * command line argument processing and splitByLine, similar to how the main program
     * works. The results are written to a subdirectory. The subdirectory is compared to
     * the expected output directory using the system 'diff' command.
     *
     * splitByLine is called multiple times for each expected output case. The different
     * calls iterate over a series of small readBufferSizes. This is how tests for edge
     * cases in the readBufferSize vs line lengths, newline placement, etc., are
     * accomplished.
     *
     * Note: One way to understand what is going on is to comment out the line:
     *
     *    scope(exit) testDir.rmdirRecurse;
     *
     * Then run the test (e.g. 'make test') and look at the directory structure left
     * behind. Print out the 'testDir' directory to see where it is located.
     */

    /* testSplitByLineCount acts as a surrogate for main() and tsvSplit(). It makes the
     * call to splitByLineCount and calls 'diff' to compare the output directory to the
     * expected directory. An assert is thrown if the directories do not match.
     */
    static void testSplitByLineCount(string[] cmdArgs, string expectedDir,
                                     size_t readBufferSize = 1024L * 512L)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSplitByLineCount] cmdArgs must not be empty.");

        /* Prefixes assert failure messages with the test identity for easier triage. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSplitByLineCount] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSplitOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
        assert(cmdopt.linesPerFile != 0, "[testSplitByLineCount] --lines-per-file is required.");
        assert(!cmdopt.dir.empty, "[testSplitByLineCount] --dir is required.");

        splitByLineCount(cmdopt, readBufferSize);

        /* Diff command setup. Compares the produced directory to the expected one. */
        auto diffCmdArgs = ["diff", expectedDir, cmdopt.dir];
        auto diffResult = executeShell(escapeShellCommand(diffCmdArgs));
        assert(diffResult.status == 0,
               format("[testSplitByLineCount]\n cmd: %s\n readBufferSize: %d\n expectedDir: %s\n------ Diff ------%s\n-------",
                      savedCmdArgs, readBufferSize, expectedDir, diffResult.output));
    }

    auto testDir = makeUnittestTempDir("tsv_split_bylinecount");
    scope(exit) testDir.rmdirRecurse;

    auto inputDir = buildPath(testDir, "lc_input");
    auto outputDir = buildPath(testDir, "lc_output");
    auto expectedDir = buildPath(testDir, "lc_expected");

    mkdir(inputDir);
    mkdir(outputDir);
    mkdir(expectedDir);

    /* Builds the canonical input file path for a given line length / line count combo. */
    static string buildInputFilePath(string dir, long inputLineLength, long inputFileNumLines)
    {
        return buildPath(dir, format("input_%dx%d.txt", inputLineLength, inputFileNumLines));
    }

    string[5] outputRowData =
        [
            "abcde",
            "fghij",
            "klmno",
            "pqrst",
            "uvwxy"
        ];

    /* The main test loop. Iterates over input line lengths, numbers of rows,
     * lines-per-file, and finally readBufferSize lengths. All combos are tested.
     */
    foreach (inputLineLength; 0 .. 6)
    {
        foreach (inputFileNumLines; 2 .. 6)
        {
            auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);

            /* Generate the input file for this line length / line count combination. */
            {
                auto ofile = inputFile.File("w");
                auto output = appender!(char[])();
                foreach (m; 0 .. inputFileNumLines)
                {
                    put(output, outputRowData[m][0 .. inputLineLength]);
                    put(output, '\n');
                }
                ofile.write(output.data);
                ofile.close;
            }

            /* Iterate over the different lines-per-file lengths.
             * - Create an expected output directory and files for each.
             * - Test with different readBufferSize values.
             */
            foreach (outputFileNumLines; 1 .. min(5, inputFileNumLines))
            {
                auto expectedSubDir =
                    buildPath(expectedDir, format("%dx%d_by_%d", inputLineLength,
                                                  inputFileNumLines, outputFileNumLines));
                mkdir(expectedSubDir);

                size_t filenum = 0;
                size_t linesWritten = 0;
                while (linesWritten < inputFileNumLines)
                {
                    auto expectedFile = buildPath(expectedSubDir, format("part_%d.txt", filenum));
                    auto f = expectedFile.File("w");
                    auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);
                    foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
                    {
                        f.writeln(line[0 .. inputLineLength]);
                    }
                    linesWritten += linesToWrite;
                    ++filenum;
                    f.close;
                }

                /* Test the different readBufferSizes.
                 * - An output directory is created for the run and deleted afterward.
                 * - First test the default size.
                 * - Then iterate over small readBufferSize values.
                 */
                auto outputSubDir =
                    buildPath(outputDir, format("%dx%d_by_%d", inputLineLength,
                                                inputFileNumLines, outputFileNumLines));
                mkdir(outputSubDir);

                testSplitByLineCount(
                    ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                     "--digit-width", "1", inputFile],
                    expectedSubDir);

                outputSubDir.rmdirRecurse;

                foreach (readBufSize; 1 .. 8)
                {
                    mkdir(outputSubDir);

                    testSplitByLineCount(
                        ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
                         "--digit-width", "1", inputFile],
                        expectedSubDir, readBufSize);

                    outputSubDir.rmdirRecurse;
                }
            }
        }
    }

    {
        /* Tests for the special case where readBufferSize is smaller than the header
         * line. We'll reuse the input_5x4.txt input file and write 1 line-per-file.
         */
        immutable inputLineLength = 5;
        immutable inputFileNumLines = 4;
        immutable outputFileNumLines = 1;

        auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);
        assert(inputFile.exists);

        auto expectedSubDirHeader =
            buildPath(expectedDir, format("%dx%d_by_%d_header", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        auto expectedSubDirHeaderInOnly =
            buildPath(expectedDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                          inputFileNumLines, outputFileNumLines));

        mkdir(expectedSubDirHeader);
        mkdir(expectedSubDirHeaderInOnly);

        /* Generate the expected results. Cheat by starting with linesWritten = 1. This
         * automatically excludes the header line, but keeps the loop code consistent
         * with the main test loop.
         */
        size_t filenum = 0;
        size_t linesWritten = 1;
        while (linesWritten < inputFileNumLines)
        {
            auto expectedFileHeader = buildPath(expectedSubDirHeader, format("part_%d.txt", filenum));
            auto expectedFileHeaderInOnly = buildPath(expectedSubDirHeaderInOnly,
                                                      format("part_%d.txt", filenum));
            auto fHeader = expectedFileHeader.File("w");
            auto fHeaderInOnly = expectedFileHeaderInOnly.File("w");
            auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);

            /* The '--header' variant repeats the header in every file; the
             * '--header-in-only' variant strips it from all output files.
             */
            fHeader.writeln(outputRowData[0][0 .. inputLineLength]);
            foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
            {
                fHeader.writeln(line[0 .. inputLineLength]);
                fHeaderInOnly.writeln(line[0 .. inputLineLength]);
            }
            linesWritten += linesToWrite;
            ++filenum;
            fHeader.close;
            fHeaderInOnly.close;
        }

        /* Now run the tests. */
        auto outputSubDirHeader =
            buildPath(outputDir, format("%dx%d_by_%d_header", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));
        auto outputSubDirHeaderInOnly =
            buildPath(outputDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
                                        inputFileNumLines, outputFileNumLines));

        foreach (readBufSize; 1 .. 6)
        {
            mkdir(outputSubDirHeader);
            mkdir(outputSubDirHeaderInOnly);

            testSplitByLineCount(
                ["test", "--header", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeader, "--digit-width", "1", inputFile],
                expectedSubDirHeader, readBufSize);

            testSplitByLineCount(
                ["test", "--header-in-only", "--lines-per-file", outputFileNumLines.to!string,
                 "--dir", outputSubDirHeaderInOnly, "--digit-width", "1", inputFile],
                expectedSubDirHeaderInOnly, readBufSize);

            outputSubDirHeader.rmdirRecurse;
            outputSubDirHeaderInOnly.rmdirRecurse;
        }
    }
}