1 /**
2 Command line tool for splitting a files (or files) into multiple output files.
3 Several methods for splitting are available, including splitting by line count,
4 splitting by random assignment, and splitting by random assignment based on
5 key fields.
6 
7 Copyright (c) 2020-2021, eBay Inc.
8 Initially written by Jon Degenhardt
9 
10 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
11 */
12 module tsv_utils.tsv_split;
13 
14 import std.exception : enforce;
15 import std.format : format;
16 import std.range;
17 import std.stdio;
18 import std.typecons : tuple, Flag;
19 
20 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
21 
22 version(unittest)
23 {
24     // When running unit tests, use main from -main compiler switch.
25 }
26 else
27 {
28     /** Main program.
29      *
30      * Invokes command line argument processing and calls tsvSplit to do the real
31      * work. Errors occurring during processing are caught and reported to the user.
32      */
33     int main(string[] cmdArgs)
34     {
35         /* When running in DMD code coverage mode, turn on report merging. */
36         version(D_Coverage) version(DigitalMars)
37         {
38             import core.runtime : dmd_coverSetMerge;
39             dmd_coverSetMerge(true);
40         }
41 
42         TsvSplitOptions cmdopt;
43         const r = cmdopt.processArgs(cmdArgs);
44         if (!r[0]) return r[1];
45         version(LDC_Profile)
46         {
47             import ldc.profile : resetAll;
48             resetAll();
49         }
50         try
51         {
52             tsvSplit(cmdopt);
53         }
54         catch (Exception exc)
55         {
56             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
57             return 1;
58         }
59         return 0;
60     }
61 }
62 
63 immutable helpText = q"EOS
64 Synopsis: tsv-split [options] [file...]
65 
66 Split input lines into multiple output files. There are three modes of
67 operation:
68 
69 * Fixed number of lines per file (--l|lines-per-file NUM): Each input
70   block of NUM lines is written to a new file. Similar to Unix 'split'.
71 
72 * Random assignment (--n|num-files NUM): Each input line is written to a
73   randomly selected output file. Random selection is from NUM files.
74 
75 * Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
76   Input lines are written to output files using fields as a key. Each
77   unique key is randomly assigned to one of NUM output files. All lines
78   with the same key are written to the same file.
79 
80 By default, files are written to the current directory and have names
81 of the form 'part_NNN<suffix>', with 'NNN' being a number and <suffix>
82 being the extension of the first input file. If the input file is
83 'file.txt', the names will take the form 'part_NNN.txt'. The output
84 directory and file names are customizable.
85 
86 Fields are specified using field number or field name. Field names
87 require that the input file has a header line.
88 
89 Use '--help-verbose' for more detailed information.
90 
91 Options:
92 EOS";
93 
94 immutable helpTextVerbose = q"EOS
95 Synopsis: tsv-split [options] [file...]
96 
97 Split input lines into multiple output files. There are three modes of
98 operation:
99 
100 * Fixed number of lines per file (--l|lines-per-file NUM): Each input
101   block of NUM lines is written to a new file. Similar to Unix 'split'.
102 
103 * Random assignment (--n|num-files NUM): Each input line is written to a
104   randomly selected output file. Random selection is from NUM files.
105 
106 * Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS):
107   Input lines are written to output files using fields as a key. Each
108   unique key is randomly assigned to one of NUM output files. All lines
109   with the same key are written to the same file.
110 
111 Output files: By default, files are written to the current directory and
112 have names of the form 'part_NNN<suffix>', with 'NNN' being a number and
113 <suffix> being the extension of the first input file. If the input file is
114 'file.txt', the names will take the form 'part_NNN.txt'. The suffix is
115 empty when reading from standard input. The numeric part defaults to 3
116 digits for '--l|lines-per-files'. For '--n|num-files' enough digits are
117 used so all filenames are the same length. The output directory and file
118 names are customizable.
119 
120 Header lines: There are two ways to handle input with headers: write a
121 header to all output files (--H|header), or exclude headers from all
122 output files ('--I|header-in-only'). The best choice depends on the
123 follow-up processing. All tsv-utils tools support header lines in multiple
124 input files, but many other tools do not. For example, GNU parallel works
125 best on files without header lines.
126 
127 Random assignment (--n|num-files): Random distribution of records to a set
128 of files is a common task. When data fits in memory the preferred approach
129 is usually to shuffle the data and split it into fixed sized blocks. E.g.
130 'tsv-sample data.tsv | tsv-split -l NUM'. However, alternate approaches
131 are needed when data is too large for convenient shuffling. tsv-split's
132 random assignment feature is useful in this case. Each input line is
133 written to a randomly selected output file. Note that output files will
134 have similar but not identical numbers of records.
135 
136 Random assignment by key (--n|num-files NUM, --k|key-fields FIELDS): This
137 splits a data set into multiple files sharded by key. All lines with the
138 same key are written to the same file. This partitioning enables parallel
139 computation based on the key. For example, statistical calculation
140 ('tsv-summarize --group-by') or duplicate removal ('tsv-uniq --fields').
141 These operations can be parallelized using tools like GNU parallel, which
142 simplifies concurrent operations on multiple files. Fields are specified
143 using field number or field name. Field names require that the input file
144 has a header line. Use '--help-fields' for details about field names.
145 
146 Random seed: By default, each tsv-split invocation using random assignment
147 or random assignment by key produces different assignments to the output
148 files. Using '--s|static-seed' changes this so multiple runs produce the
149 same assignments. This works by using the same random seed each run. The
150 seed can be specified using '--v|seed-value'.
151 
152 Appending to existing files: By default, an error is triggered if an
153 output file already exists. '--a|append' changes this so that lines are
154 appended to existing files. (Header lines are not appended to files with
155 data.) This is useful when adding new data to files created by a previous
156 tsv-split run. Random assignment should use the same '--n|num-files' value
157 each run, but different random seeds (avoid '--s|static-seed'). Random
158 assignment by key should use the same '--n|num-files', '--k|key-fields',
159 and seed ('--s|static-seed' or '--v|seed-value') each run.
160 
161 Max number of open files: Random assignment and random assignment by key
162 are dramatically faster when all output files are kept open. However,
163 keeping a large numbers of open files can bump into system limits or limit
164 resources available to other processes. By default, tsv-split uses up to
165 4096 open files or the system per-process limit, whichever is smaller.
166 This can be changed using '--max-open-files', though it cannot be set
167 larger than the system limit. The system limit varies considerably between
168 systems. On many systems it is unlimited. On MacOS it is often set to 256.
169 Use Unix 'ulimit' to display and modify the limits:
170 * 'ulimit -n' - Show the "soft limit". The per-process maximum.
171 * 'ulimit -Hn' - Show the "hard limit". The max allowed soft limit.
172 * 'ulimit -Sn NUM' - Change the "soft limit" to NUM.
173 
174 Examples:
175 
176   # Split a 10 million line file into 1000 files, 10,000 lines each.
177   # Output files are part_000.txt, part_001.txt, ... part_999.txt.
178   tsv-split data.txt --lines-per-file 10000
179 
180   # Same as the previous example, but write files to a subdirectory.
181   tsv-split data.txt --dir split_files --lines-per-file 10000
182 
183   # Split a file into 10,000 line files, writing a header line to each
184   tsv-split data.txt -H --lines-per-file 10000
185 
186   # Same as the previous example, but dropping the header line.
187   tsv-split data.txt -I --lines-per-file 10000
188 
189   # Randomly assign lines to 1000 files
190   tsv-split data.txt --num-files 1000
191 
192   # Randomly assign lines to 1000 files while keeping unique entries
193   # from the 'url' field together.
194   tsv-split data.tsv -H -k url --num-files 1000
195 
196   # Randomly assign lines to 1000 files. Later, randomly assign lines
197   # from a second data file to the same output files.
198   tsv-split data1.tsv -n 1000
199   tsv-split data2.tsv -n 1000 --append
200 
201   # Randomly assign lines to 1000 files using field 3 as a key.
202   # Later, add a second file to the same output files.
203   tsv-split data1.tsv -n 1000 -k 3 --static-seed
204   tsv-split data2.tsv -n 1000 -k 3 --static-seed --append
205 
206   # Change the system per-process open file limit for one command.
207   # The parens create a sub-shell. The current shell is not changed.
208   ( ulimit -Sn 1000 && tsv-split --num-files 1000 data.txt )
209 
210 Options:
211 EOS";
212 
213 /** Container for command line options and derived data.
214  *
215  * TsvSplitOptions handles several aspects of command line options. On the input side,
216  * it defines the command line options available, performs validation, and sets up any
217  * derived state based on the options provided. These activities are handled by the
218  * processArgs() member.
219  *
220  * Once argument processing is complete, TsvSplitOptions is used as a container
221  * holding the specific processing options used by the splitting algorithms.
222  */
223 struct TsvSplitOptions
224 {
225     import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader;
226 
227     enum invalidFileSuffix = "///////";
228 
229     string programName;                        /// Program name
230     InputSourceRange inputSources;             /// Input files
231     bool headerInOut = false;                  /// --H|header
232     bool headerIn = false;                     /// --I|header-in-only
233     size_t linesPerFile = 0;                   /// --l|lines-per-file
234     uint numFiles = 0;                         /// --n|num-files
235     size_t[] keyFields;                        /// Derived: --k|key-fields
236     string dir;                                /// --dir
237     string prefix = "part_";                   /// --prefix
238     string suffix = invalidFileSuffix;         /// --suffix
239     uint digitWidth = 0;                       /// --w|digit-width
240     bool appendToExistingFiles = false;        /// --a|append
241     bool staticSeed = false;                   /// --s|static-seed
242     uint seedValueOptionArg = 0;               /// --v|seed-value
243     char delim = '\t';                         /// --d|delimiter
244     uint maxOpenFilesArg = 0;                  /// --max-open-files
245     bool hasHeader = false;                    /// Derived. True if either '--H|header' or '--I|header-in-only' is set.
246     bool keyIsFullLine = false;                /// Derived. True if '--f|fields 0' is specfied.
247     bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
248     uint seed = 0;                             /// Derived from --static-seed, --seed-value
249     uint maxOpenOutputFiles;                   /// Derived.
250 
251     /** Process tsv-split command line arguments.
252      *
253      * Defines the command line options, performs validation, and derives additional
254      * state. std.getopt.getopt is called to do the main option processing followed
255      * additional validation and derivation.
256      *
257      * Help text is printed to standard output if help was requested. Error text is
258      * written to stderr if invalid input is encountered.
259      *
260      * A tuple is returned. First value is true if command line arguments were
261      * successfully processed and execution should continue, or false if an error
262      * occurred or the user asked for help. If false, the second value is the
263      * appropriate exit code (0 or 1).
264      *
265      * Returning true (execution continues) means args have been validated and derived
266      * values calculated. Field indices will have been converted to zero-based.
267      */
268     auto processArgs(ref string[] cmdArgs)
269     {
270         import std.algorithm : all, canFind, each, min;
271         import std.conv : to;
272         import std.file : exists, isDir;
273         import std.getopt;
274         import std.math : isNaN;
275         import std.path : baseName, expandTilde, extension, stripExtension;
276         import std.typecons : Yes, No;
277         import tsv_utils.common.fieldlist;
278 
279         bool helpVerbose = false;                  // --help-verbose
280         bool helpFields = false;                   // --help-fields
281         bool versionWanted = false;                // --V|version
282         string keyFieldsArg;                       // --k|key-fields
283 
284         string keyFieldsOptionString = "k|key-fields";
285 
286         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
287 
288         try
289         {
290             arraySep = ",";    // Use comma to separate values in command line options
291             auto r = getopt(
292                 cmdArgs,
293                 "help-verbose",    "     Print more detailed help.", &helpVerbose,
294                 "help-fields",     "     Print help on specifying fields.", &helpFields,
295 
296                 std.getopt.config.caseSensitive,
297                 "H|header",         "     Input files have a header line. Write the header to each output file.", &headerInOut,
298                 "I|header-in-only", "     Input files have a header line. Do not write the header to output files.", &headerIn,
299                 std.getopt.config.caseInsensitive,
300 
301                 "l|lines-per-file", "NUM  Number of lines to write to each output file (excluding the header line).", &linesPerFile,
302                 "n|num-files",      "NUM  Number of output files to generate.", &numFiles,
303 
304                 keyFieldsOptionString,
305                 "<field-list>  Fields to use as key. Lines with the same key are written to the same output file. Use '--k|key-fields 0' to use the entire line as the key.",
306                 &keyFieldsArg,
307 
308                 "dir",              "STR  Directory to write to. Default: Current working directory.", &dir,
309                 "prefix",           "STR  Filename prefix. Default: 'part_'", &prefix,
310                 "suffix",           "STR  Filename suffix. Default: First input file extension. None for standard input.", &suffix,
311                 "w|digit-width",    "NUM  Number of digits in filename numeric portion. Default: '--l|lines-per-file': 3. '--n|num-files': Chosen so filenames have the same length. '--w|digit-width 0' uses the default.", &digitWidth,
312                 "a|append",         "     Append to existing files.", &appendToExistingFiles,
313 
314                 "s|static-seed",    "     Use the same random seed every run.", &staticSeed,
315 
316                 std.getopt.config.caseSensitive,
317                 "v|seed-value",     "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
318                 std.getopt.config.caseInsensitive,
319 
320                 "d|delimiter",      "CHR  Field delimiter.", &delim,
321                 "max-open-files",   "NUM  Maximum open file handles to use. Min of 5 required.", &maxOpenFilesArg,
322 
323                 std.getopt.config.caseSensitive,
324                 "V|version",        "     Print version information and exit.", &versionWanted,
325                 std.getopt.config.caseInsensitive,
326                 );
327 
328             if (r.helpWanted)
329             {
330                 defaultGetoptPrinter(helpText, r.options);
331                 return tuple(false, 0);
332             }
333             else if (helpVerbose)
334             {
335                 defaultGetoptPrinter(helpTextVerbose, r.options);
336                 return tuple(false, 0);
337             }
338             else if (helpFields)
339             {
340                 writeln(fieldListHelpText);
341                 return tuple(false, 0);
342             }
343             else if (versionWanted)
344             {
345                 import tsv_utils.common.tsvutils_version;
346                 writeln(tsvutilsVersionNotice("tsv-split"));
347                 return tuple(false, 0);
348             }
349 
350             /* Remaining command line args are files.
351              */
352             string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
353             cmdArgs.length = 1;
354 
355             /* Validation and derivations - Do as much validation prior to header line
356              * processing as possible (avoids waiting on stdin).
357              *
358              * Note: keyFields depends on header line processing, but keyFieldsArg
359              * can be used to detect whether the command line argument was specified.
360              */
361 
362             enforce(!(headerInOut && headerIn),
363                     "Use only one of '--H|header' and '--I|header-in-only'.");
364 
365             hasHeader = headerInOut || headerIn;
366 
367             enforce(linesPerFile != 0 || numFiles != 0,
368                     "Either '--l|lines-per-file' or '--n|num-files' is required.");
369 
370             enforce(linesPerFile == 0 || numFiles == 0,
371                     "'--l|lines-per-file' and '--n|num-files' cannot be used together.");
372 
373             enforce(linesPerFile == 0 || keyFieldsArg.length == 0,
374                     "'--l|lines-per-file' and '--k|key-fields' cannot be used together.");
375 
376             enforce(numFiles != 1, "'--n|num-files must be two or more.");
377 
378             if (!dir.empty)
379             {
380                 dir = dir.expandTilde;
381                 enforce(dir.exists, format("Directory does not exist: --dir '%s'", dir));
382                 enforce(dir.isDir, format("Path is not a directory: --dir '%s'", dir));
383             }
384 
385             /* Seed. */
386             import std.random : unpredictableSeed;
387 
388             usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);
389 
390             if (usingUnpredictableSeed) seed = unpredictableSeed;
391             else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
392             else if (staticSeed) seed = 2438424139;
393             else assert(0, "Internal error, invalid seed option states.");
394 
395             /* Maximum number of open files. Mainly applies when --num-files is used.
396              *
397              * Derive maxOpenOutputFiles. Inputs:
398              * - Internal default limit: 4096. This is a somewhat conservative setting.
399              * - rlimit open files limit. Defined by '$ ulimit -n'.
400              * - '--max-open-files' (maxOpenFilesArg). This adjusts the internal limit,
401              *   but only up to the rlimit value.
402              * - Four open files are reserved for stdin, stdout, stderr, and one input
403              *   file.
404              */
405 
406             immutable uint internalDefaultMaxOpenFiles = 4096;
407             immutable uint numReservedOpenFiles = 4;
408             immutable uint rlimitOpenFilesLimit = rlimitCurrOpenFilesLimit();
409 
410             enforce(maxOpenFilesArg == 0 || maxOpenFilesArg > numReservedOpenFiles,
411                     format("'--max-open-files' must be at least %d.",
412                            numReservedOpenFiles + 1));
413 
414             enforce(maxOpenFilesArg <= rlimitOpenFilesLimit,
415                     format("'--max-open-files' value (%d) greater current system limit (%d)." ~
416                            "\nRun 'ulimit -n' to see the soft limit." ~
417                            "\nRun 'ulimit -Hn' to see the hard limit." ~
418                            "\nRun 'ulimit -Sn NUM' to change the soft limit.",
419                            maxOpenFilesArg, rlimitOpenFilesLimit));
420 
421             enforce(rlimitOpenFilesLimit > numReservedOpenFiles,
422                     format("System open file limit too small. Current value: %d. Must be %d or more." ~
423                            "\nRun 'ulimit -n' to see the soft limit." ~
424                            "\nRun 'ulimit -Hn' to see the hard limit." ~
425                            "\nRun 'ulimit -Sn NUM' to change the soft limit.",
426                            rlimitOpenFilesLimit, numReservedOpenFiles + 1));
427 
428             immutable uint openFilesLimit =
429                 (maxOpenFilesArg != 0)
430                 ? maxOpenFilesArg
431                 : min(internalDefaultMaxOpenFiles, rlimitOpenFilesLimit);
432 
433             assert(openFilesLimit > numReservedOpenFiles);
434 
435             maxOpenOutputFiles = openFilesLimit - numReservedOpenFiles;
436 
437             /* Suffix - If not provided, use the extension of the first input file.
438              * No suffix if reading from standard input.
439              */
440             if (suffix == invalidFileSuffix) suffix = filepaths[0].extension;
441 
442             /* Ensure forward slash is not included in the filename prefix and suffix.
443              * Forward slash is an invalid Unix filename character. However, open file
444              * calls could match a directory path, resulting in unintended file
445              * creation.
446              *
447              * The other invalid filename character on Unix is the NULL character.
448              * However, the NULL character cannot be entered via Unix command lines,
449              * so there is no need to test for it explicitly.
450              */
451             enforce(!prefix.canFind('/'),
452                     "'--prefix' cannot contain forward slash characters. Use '--dir' to specify an output directory.");
453 
454             enforce(!suffix.canFind('/'),
455                     "'--suffix' cannot contain forward slash characters. Use '--dir' to specify an output directory.");
456 
457             /* Digit width - If not specified, or specified as zero, the width is
458              * determined by the number of files for --num-files, or defaulted to 3
459              * for --lines-per-file.
460              */
461             if (digitWidth == 0)
462             {
463                 if (numFiles > 0)
464                 {
465                     digitWidth = 1;
466                     uint n = numFiles - 1;
467                     while (n >= 10)
468                     {
469                         n /= 10;
470                         ++digitWidth;
471                     }
472                 }
473                 else
474                 {
475                     digitWidth = 3;
476                 }
477             }
478             assert(digitWidth != 0);
479 
480             /*
481              * Create the inputSourceRange and perform header line processing.
482              */
483             ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
484             inputSources = inputSourceRange(filepaths, readHeader);
485 
486             string[] headerFields;
487 
488             if (hasHeader) headerFields = inputSources.front.header.split(delim).to!(string[]);
489 
490             if (!keyFieldsArg.empty)
491             {
492                 keyFields =
493                     keyFieldsArg
494                     .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
495                     (hasHeader, headerFields, keyFieldsOptionString)
496                     .array;
497             }
498 
499             if (keyFields.length > 0)
500             {
501                 if (keyFields.length == 1 && keyFields[0] == 0)
502                 {
503                     keyIsFullLine = true;
504                 }
505                 else
506                 {
507                     enforce(keyFields.all!(x => x != 0),
508                             "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");
509 
510                     keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
511                 }
512             }
513 
514         }
515         catch (Exception exc)
516         {
517             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
518             return tuple(false, 1);
519         }
520         return tuple(true, 0);
521     }
522 }
523 
524 /* TsvSplitOptions unit tests (command-line argument processing).
525  *
526  * Basic tests. Many cases are covered in executable tests, including all error cases,
527  * as errors write to stderr.
528  */
529 unittest
530 {
531     import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
532     import std.conv : to;
533     import std.file : mkdir, rmdirRecurse;
534     import std.path : buildPath;
535 
536     /* A pair of dummy files are used so we don't have to worry about the cases where
537      * command line processing might open a file. Don't want to use standard input for
538      * this, at least in cases where it might try to read to get the header line.
539      *
540      * Note: For Windows we need to ensure there are no references held to the dummy
541      * file (somefile.txt) by the time rmdirRecurse tries to remove it. So we take
542      * a step not necessary in normal code and explicitly empty the inputSources in
543      * TsvSplitOptions structs that are created during the tests. In normal code,
544      * this happens when the input sources are iterated, but the sources are not
545      * iterated in these tests.
546      */
547     auto testDir = makeUnittestTempDir("tsv_split_bylinecount");
548     scope(exit) testDir.rmdirRecurse;
549 
550     string somefile_txt = buildPath(testDir, "somefile.txt");
551     string anotherfile_pqr = buildPath(testDir, "anotherfile.pqr");
552 
553     {
554         auto f1 = somefile_txt.File("wb");
555         f1.writeln("Hello World!");
556         f1.close;
557 
558         auto f2 = anotherfile_pqr.File("wb");
559         f2.writeln("Good Morning World!");
560         f2.close;
561     }
562 
563     {
564         auto args = ["unittest", "--lines-per-file", "10", somefile_txt];
565         TsvSplitOptions cmdopt;
566         const r = cmdopt.processArgs(args);
567 
568         assert(cmdopt.linesPerFile == 10);
569         assert(cmdopt.keyFields.empty);
570         assert(cmdopt.numFiles == 0);
571         assert(cmdopt.hasHeader == false);
572 
573         while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
574     }
575     {
576         auto args = ["unittest", "--num-files", "20", somefile_txt];
577         TsvSplitOptions cmdopt;
578         const r = cmdopt.processArgs(args);
579 
580         assert(cmdopt.linesPerFile == 0);
581         assert(cmdopt.keyFields.empty);
582         assert(cmdopt.numFiles == 20);
583         assert(cmdopt.hasHeader == false);
584 
585         while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
586     }
587     {
588         auto args = ["unittest", "-n", "5", "--key-fields", "1-3", somefile_txt];
589         TsvSplitOptions cmdopt;
590         const r = cmdopt.processArgs(args);
591 
592         assert(cmdopt.linesPerFile == 0);
593         assert(cmdopt.keyFields == [0, 1, 2]);
594         assert(cmdopt.numFiles == 5);
595         assert(cmdopt.hasHeader == false);
596         assert(cmdopt.keyIsFullLine == false);
597 
598         while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
599     }
600     {
601         auto args = ["unittest", "-n", "5", "-k", "0", somefile_txt];
602         TsvSplitOptions cmdopt;
603         const r = cmdopt.processArgs(args);
604 
605         assert(cmdopt.linesPerFile == 0);
606         assert(cmdopt.numFiles == 5);
607         assert(cmdopt.hasHeader == false);
608         assert(cmdopt.keyIsFullLine == true);
609 
610         while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
611     }
612     {
613         auto args = ["unittest", "-n", "2", "--header", somefile_txt];
614         TsvSplitOptions cmdopt;
615         const r = cmdopt.processArgs(args);
616 
617         assert(cmdopt.headerInOut == true);
618         assert(cmdopt.hasHeader == true);
619         assert(cmdopt.headerIn == false);
620 
621         while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
622     }
623     {
624         auto args = ["unittest", "-n", "2", "--header-in-only", somefile_txt];
625         TsvSplitOptions cmdopt;
626         const r = cmdopt.processArgs(args);
627 
628         assert(cmdopt.headerInOut == false);
629         assert(cmdopt.hasHeader == true);
630         assert(cmdopt.headerIn == true);
631 
632         while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
633     }
634 
635     static void testSuffix(string[] args, string expectedSuffix)
636     {
637         TsvSplitOptions cmdopt;
638         auto savedArgs = args.to!string;
639         const r = cmdopt.processArgs(args);
640 
641         assert(r[0], format("[testSuffix] cmdopt.processArgs(%s) returned false.", savedArgs));
642         assert(cmdopt.suffix == expectedSuffix,
643                format("[testSuffix] Incorrect cmdopt.suffix. Expected: '%s', Actual: '%s'\n   cmdopt.processArgs(%s)",
644                       expectedSuffix, cmdopt.suffix, savedArgs));
645 
646         while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
647     }
648 
649     /* In these tests, don't use headers and when files are listed, use 'somefile_txt' first.
650      * This makes sure there is no attempt to read standard input and that there won't be an
651      * open failure trying to find a file.
652      */
653     testSuffix(["unittest", "-n", "2"], "");
654     testSuffix(["unittest", "-n", "2", "--", "-"], "");
655     testSuffix(["unittest", "-n", "2", "--suffix", "_123"], "_123");
656     testSuffix(["unittest", "-n", "2", somefile_txt], ".txt");
657     testSuffix(["unittest", "-n", "2", somefile_txt, anotherfile_pqr], ".txt");
658     testSuffix(["unittest", "-n", "2", "--suffix", ".X", somefile_txt, anotherfile_pqr], ".X");
659     testSuffix(["unittest", "-n", "2", "--suffix", "", somefile_txt], "");
660     testSuffix(["unittest", "-n", "2", "--", "-", somefile_txt], "");
661     testSuffix(["unittest", "-n", "2", "--", somefile_txt, "-"], ".txt");
662 
663     static void testDigitWidth(string[] args, uint expected)
664     {
665         TsvSplitOptions cmdopt;
666         auto savedArgs = args.to!string;
667         const r = cmdopt.processArgs(args);
668 
669         assert(r[0], format("[testDigitWidth] cmdopt.processArgs(%s) returned false.", savedArgs));
670         assert(cmdopt.digitWidth == expected,
671                format("[testDigitWidth] Incorrect cmdopt.digitWidth. Expected: %d, Actual: %d\n   cmdopt.processArgs(%s)",
672                       expected, cmdopt.digitWidth, savedArgs));
673 
674         while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
675     }
676 
677     testDigitWidth(["unittest", "-n", "2", somefile_txt], 1);
678     testDigitWidth(["unittest", "-n", "2", "--digit-width" , "0", somefile_txt], 1);
679     testDigitWidth(["unittest", "-n", "10", somefile_txt], 1);
680     testDigitWidth(["unittest", "-n", "11", somefile_txt], 2);
681     testDigitWidth(["unittest", "-n", "555", somefile_txt], 3);
682     testDigitWidth(["unittest", "-n", "555", "--digit-width" , "2", somefile_txt], 2);
683     testDigitWidth(["unittest", "-n", "555", "--digit-width" , "4", somefile_txt], 4);
684     testDigitWidth(["unittest", "-l", "10", somefile_txt], 3);
685     testDigitWidth(["unittest", "-l", "10000", somefile_txt], 3);
686     testDigitWidth(["unittest", "-l", "10000", "--digit-width", "0", somefile_txt], 3);
687     testDigitWidth(["unittest", "-l", "10000", "--digit-width", "1", somefile_txt], 1);
688     testDigitWidth(["unittest", "-l", "10000", "--digit-width", "5", somefile_txt], 5);
689 }
690 
691 /** Get the rlimit current number of open files the process is allowed.
692  *
693  * This routine returns the current soft limit on the number of open files the process
694  * is allowed. This is the number returned by the command: '$ ulimit -n'.
695  *
696  * This routine translates this value to a 'uint', as tsv-split uses 'uint' for
697  * tracking output files. The rlimit 'rlim_t' type is usually 'ulong' or 'long'.
698  * RLIM_INFINITY and any value larger than 'uint.max' is translated to 'uint.max'.
699  *
700  * An exception is thrown if call to 'getrlimit' fails.
701  *
702  * Note about Windows: rlimit is a Posix construct, not available on Windows.
703  * Currently, tsv-split is written for Posix. To allow it compile on Windows, this
704  * routine returns 512 on Windows, which is the default for Windows stream I/O. This
705  * is a stop-gap solution. A more generalized 'systemCurrOpenFilesLimit' would make
706  * sense if Windows becomes primary platform. That would also require changing error
707  * messages, help, etc., to be platform specfic. At present, testing is done only on
708  * Posix platforms. For info on Windows stream I/O limits see:
709  *   https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/setmaxstdio
710  */
711 uint rlimitCurrOpenFilesLimit()
712 {
713     version (Posix)
714     {
715         import core.sys.posix.sys.resource :
716             rlim_t, rlimit, getrlimit, RLIMIT_NOFILE, RLIM_INFINITY, RLIM_SAVED_CUR;
717         import std.conv : to;
718 
719         uint currOpenFileLimit = uint.max;
720 
721         rlimit rlimitMaxOpenFiles;
722 
723         enforce(getrlimit(RLIMIT_NOFILE, &rlimitMaxOpenFiles) == 0,
724                 "Internal error: getrlimit call failed");
725 
726         if (rlimitMaxOpenFiles.rlim_cur != RLIM_INFINITY &&
727             rlimitMaxOpenFiles.rlim_cur != RLIM_SAVED_CUR &&
728             rlimitMaxOpenFiles.rlim_cur >= 0 &&
729             rlimitMaxOpenFiles.rlim_cur <= uint.max)
730         {
731             currOpenFileLimit = rlimitMaxOpenFiles.rlim_cur.to!uint;
732         }
733 
734         return currOpenFileLimit;
735     }
736     else version (Windows)
737     {
738         return 512;
739     }
740     else
741     {
742         static assert(0, "Unsupported platform.");
743     }
744 }
745 
746 /** Invokes the proper split routine based on the command line arguments.
747  *
748  * This routine is the top-level control after command line argument processing is
749  * done. It's primary job is to set up data structures and invoke the correct
750  * processing routine based on the command line arguments.
751  */
752 void tsvSplit(ref TsvSplitOptions cmdopt)
753 {
754     /* Check that the input files were setup as expected. Should at least have one
755      * input, stdin if nothing else. */
756     assert(!cmdopt.inputSources.empty);
757 
758     if (cmdopt.linesPerFile != 0)
759     {
760         splitByLineCount(cmdopt);
761     }
762     else
763     {
764         /* Randomly distribute input lines to a specified number of files. */
765 
766         auto outputFiles =
767             SplitOutputFiles(cmdopt.numFiles, cmdopt.dir, cmdopt.prefix, cmdopt.suffix,
768                              cmdopt.digitWidth, cmdopt.headerInOut, cmdopt.maxOpenOutputFiles,
769                              cmdopt.inputSources.front.header);
770 
771         if (!cmdopt.appendToExistingFiles)
772         {
773             string existingFile = outputFiles.checkIfFilesExist;
774             enforce(existingFile.length == 0,
775                     format("One or more output files already exist. Use '--a|append' to append to existing files. File: '%s'.",
776                            existingFile));
777         }
778 
779         if (cmdopt.keyFields.length == 0)
780         {
781             splitLinesRandomly(cmdopt, outputFiles);
782         }
783         else
784         {
785             splitLinesByKey(cmdopt, outputFiles);
786         }
787     }
788 }
789 
790 /** A SplitOutputFiles struct holds a collection of output files.
791  *
792  * This struct manages a collection of output files used when writing to multiple
793  * files at once. This includes constructing filenames, opening and closing files,
794  * and writing data and header lines.
795  *
796  * Both random assignment (splitLinesRandomly) and random assignment by key
797  * (splitLinesByKey) use a SplitOutputFiles struct to manage output files.
798  *
799  * The main properties of the output file set are specified in the constuctor. The
800  * exception is the header line. This is not known until the first input file is
801  * read, so it is specified in a separate 'setHeader' call.
802  *
803  * Individual output files are written to based on their zero-based index in the
804  * output collection. The caller selects the output file number to write to and
805  * calls 'writeDataLine' to write a line. The header is written if needed.
806  */
807 struct SplitOutputFiles
808 {
809     import std.conv : to;
810     import std.file : exists;
811     import std.path : buildPath;
812     import std.stdio : File;
813 
814     static struct OutputFile
815     {
816         string filename;
817         File ofile;
818         bool hasData;
819         bool isOpen;    // Track separately due to https://github.com/dlang/phobos/pull/7397
820     }
821 
822     private uint _numFiles;
823     private bool _writeHeaders;
824     private uint _maxOpenFiles;
825 
826     private OutputFile[] _outputFiles;
827     private uint _numOpenFiles = 0;
828     private string _header;
829 
830     this(uint numFiles, string dir, string filePrefix, string fileSuffix,
831          uint fileDigitWidth, bool writeHeaders, uint maxOpenFiles, string header)
832     {
833         assert(numFiles >= 2);
834         assert(maxOpenFiles >= 1);
835 
836         _numFiles = numFiles;
837         _writeHeaders = writeHeaders;
838         _maxOpenFiles = maxOpenFiles;
839         _header = header;
840 
841         _outputFiles.length = numFiles;
842 
843         /* Filename assignment. */
844         foreach (i, ref f; _outputFiles)
845         {
846             f.filename =
847                 buildPath(dir, format("%s%.*d%s", filePrefix, fileDigitWidth, i, fileSuffix));
848         }
849     }
850 
851     /* Destructor ensures all files are closed.
852      *
853      * Note: A dual check on whether the file is open is made. This is to avoid a
854      * Phobos bug where std.File doesn't properly maintain the state of open files
855      * if the File.open call fails. See: https://github.com/dlang/phobos/pull/7397.
856      */
857     ~this()
858     {
859         foreach (ref f; _outputFiles)
860         {
861             if (f.isOpen && f.ofile.isOpen)
862             {
863                 assert(_numOpenFiles >= 1);
864 
865                 f.ofile.close;
866                 f.isOpen = false;
867                 _numOpenFiles--;
868             }
869         }
870     }
871 
872     /* Check if any of the files already exist.
873      *
874      * Returns the empty string if none of the files exist. Otherwise returns the
875      * filename of the first existing file found. This is to facilitate error
876      * message generation.
877      */
878     string checkIfFilesExist()
879     {
880         foreach (f; _outputFiles) if (f.filename.exists) return f.filename;
881         return "";
882     }
883 
884     /* Picks a random file to close. Used when the open file handle limit has been
885      * reached.
886      */
887     private void closeSomeFile()
888     {
889         import std.random : uniform;
890         assert(_numOpenFiles > 0);
891 
892         immutable uint start = uniform(0, _numFiles);
893 
894         foreach (i; cycle(iota(_numFiles), start).take(_numFiles))
895         {
896             if (_outputFiles[i].isOpen)
897             {
898                 _outputFiles[i].ofile.close;
899                 _outputFiles[i].isOpen = false;
900                 _numOpenFiles--;
901 
902                 return;
903             }
904         }
905 
906         assert(false, "[SplitOutputFiles.closeSomeFile]: Could not find file to close.");
907     }
908 
909     /* Write a line to the specified file number.
910      *
911      * A header is written to the file if headers are being written and this is the
912      * first data written to the file.
913      */
914     void writeDataLine(uint fileNum, const char[] data)
915     {
916         assert(fileNum < _numFiles);
917         assert(fileNum < _outputFiles.length);
918         assert(_numOpenFiles <= _maxOpenFiles);
919 
920         OutputFile* outputFile = &_outputFiles[fileNum];
921 
922         if (!outputFile.isOpen)
923         {
924             if (_numOpenFiles == _maxOpenFiles) closeSomeFile();
925             assert(_numOpenFiles < _maxOpenFiles);
926 
927             outputFile.ofile = outputFile.filename.File("ab");
928             outputFile.isOpen = true;
929             _numOpenFiles++;
930 
931             if (!outputFile.hasData)
932             {
933                 ulong filesize = outputFile.ofile.size;
934                 outputFile.hasData = (filesize > 0 && filesize != ulong.max);
935             }
936         }
937 
938         if (_writeHeaders && !outputFile.hasData) outputFile.ofile.writeln(_header);
939 
940         outputFile.ofile.writeln(data);
941         outputFile.hasData = true;
942     }
943 }
944 
945 /** Write input lines to multiple files, randomly selecting an output file for each line.
946  */
947 void splitLinesRandomly(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles)
948 {
949     import std.random : Random = Mt19937, uniform;
950     import tsv_utils.common.utils : bufferedByLine, InputSourceRange;
951 
952     /* inputSources must be an InputSourceRange and include at least stdin. */
953     assert(!cmdopt.inputSources.empty);
954     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
955 
956     auto randomGenerator = Random(cmdopt.seed);
957 
958     /* Process each line. */
959     foreach (inputStream; cmdopt.inputSources)
960     {
961         foreach (line; inputStream.file.bufferedByLine)
962         {
963             immutable uint outputFileNum = uniform(0, cmdopt.numFiles, randomGenerator);
964             outputFiles.writeDataLine(outputFileNum, line);
965         }
966     }
967 }
968 
969 /** Write input lines to multiple output files using fields as a random selection key.
970  *
971  * Each input line is written to an output file. The output file is chosen using
972  * fields as a key. Each unique key is assigned to a file. All lines having the
973  * same key are written to the same file.
974  */
975 void splitLinesByKey(ref TsvSplitOptions cmdopt, ref SplitOutputFiles outputFiles)
976 {
977     import std.algorithm : splitter;
978     import std.conv : to;
979     import std.digest.murmurhash;
980     import tsv_utils.common.utils : bufferedByLine, InputFieldReordering,
981         InputSourceRange, throwIfWindowsNewline;
982 
983     assert(cmdopt.keyFields.length > 0);
984 
985     /* inputSources must be an InputSourceRange and include at least stdin. */
986     assert(!cmdopt.inputSources.empty);
987     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
988 
989     immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys.
990 
991     /* Create a mapping for the key fields. */
992     auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);
993 
994     /* Process each line. */
995     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
996     foreach (inputStream; cmdopt.inputSources)
997     {
998         if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1);
999 
1000         foreach (fileLineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine))
1001         {
1002             if (fileLineNum == 1) throwIfWindowsNewline(line, inputStream.name, fileLineNum);
1003 
1004             /* Murmurhash works by successively adding individual keys, then finalizing.
1005              * Adding individual keys is simpler if the full-line-as-key and individual
1006              * fields as keys cases are separated.
1007              */
1008             auto hasher = MurmurHash3!32(cmdopt.seed);
1009 
1010             if (cmdopt.keyIsFullLine)
1011             {
1012                 hasher.put(cast(ubyte[]) line);
1013             }
1014             else
1015             {
1016                 assert(keyFieldsReordering !is null);
1017 
1018                 /* Gather the key field values and assemble the key. */
1019                 keyFieldsReordering.initNewLine;
1020                 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
1021                 {
1022                     keyFieldsReordering.processNextField(fieldIndex, fieldValue);
1023                     if (keyFieldsReordering.allFieldsFilled) break;
1024                 }
1025 
1026                 enforce(keyFieldsReordering.allFieldsFilled,
1027                         format("Not enough fields in line. File: %s, Line: %s",
1028                                inputStream.name, fileLineNum));
1029 
1030                 foreach (count, key; keyFieldsReordering.outputFields.enumerate)
1031                 {
1032                     if (count > 0) hasher.put(delimArray);
1033                     hasher.put(cast(ubyte[]) key);
1034                 }
1035             }
1036 
1037             hasher.finish;
1038             immutable uint outputFileNum = hasher.get % cmdopt.numFiles;
1039             outputFiles.writeDataLine(outputFileNum, line);
1040         }
1041     }
1042 }
1043 
1044 /** Write input lines to multiple files, splitting based on line count.
1045  *
1046  * Note: readBufferSize is an argument primarily for unit test purposes. Normal uses
1047  * should use the default value.
1048  */
1049 void splitByLineCount(ref TsvSplitOptions cmdopt, const size_t readBufferSize = 1024L * 128L)
1050 {
1051     import std.file : exists;
1052     import std.path : buildPath;
1053     import std.stdio : File;
1054     import tsv_utils.common.utils : InputSourceRange;
1055 
1056     assert (readBufferSize > 0);
1057     ubyte[] readBuffer = new ubyte[readBufferSize];
1058 
1059     /* inputSources must be an InputSourceRange and include at least stdin. */
1060     assert(!cmdopt.inputSources.empty);
1061     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
1062 
1063     string header = !cmdopt.headerInOut ? "" :
1064         cmdopt.inputSources.front.header(Yes.keepTerminator);
1065     size_t nextOutputFileNum = 0;
1066     File outputFile;
1067     string outputFileName;
1068     bool isOutputFileOpen = false;           // Open file status tracked separately due to phobos bugs
1069     size_t outputFileRemainingLines;
1070 
1071     /* nextNewlineIndex finds the index of the next newline character. It is an
1072      * alternative to std.algorithm.countUntil. Invoking 'find' directly results
1073      * 'memchr' being used (faster). The current 'countUntil' implementation does
1074      * forward to find, but the way it is done avoids the memchr call optimization.
1075      */
1076     static long nextNewlineIndex(const ubyte[] buffer)
1077     {
1078         import std.algorithm : find;
1079         immutable ubyte newlineChar = '\n';
1080         immutable size_t buflen = buffer.length;
1081         immutable size_t findlen = buffer.find(newlineChar).length;
1082 
1083         return findlen > 0 ? buflen - findlen : -1;
1084     }
1085 
1086     foreach (inputStream; cmdopt.inputSources)
1087     {
1088         foreach (ref ubyte[] inputChunk; inputStream.file.byChunk(readBuffer))
1089         {
1090             size_t nextOutputChunkStart = 0;
1091             auto remainingInputChunk = inputChunk[nextOutputChunkStart .. $];
1092 
1093             while (!remainingInputChunk.empty)
1094             {
1095                 /* See if the next output file needs to be opened. */
1096                 if (!isOutputFileOpen)
1097                 {
1098                     outputFileName =
1099                         buildPath(cmdopt.dir,
1100                                   format("%s%.*d%s", cmdopt.prefix,
1101                                          cmdopt.digitWidth, nextOutputFileNum, cmdopt.suffix));
1102 
1103                     enforce(cmdopt.appendToExistingFiles || !outputFileName.exists,
1104                             format("Output file already exists. Use '--a|append' to append to existing files. File: '%s'.",
1105                                    outputFileName));
1106 
1107                     outputFile = outputFileName.File("ab");
1108                     outputFile.setvbuf(1024L * 64L, _IOFBF);
1109                     isOutputFileOpen = true;
1110                     ++nextOutputFileNum;
1111                     outputFileRemainingLines = cmdopt.linesPerFile;
1112 
1113                     if (cmdopt.headerInOut)
1114                     {
1115                         ulong filesize = outputFile.size;
1116                         if (filesize == 0 || filesize == ulong.max) outputFile.rawWrite(header);
1117                     }
1118                 }
1119 
1120                 /* Find more newlines for the current output file. */
1121 
1122                 assert(outputFileRemainingLines > 0);
1123 
1124                 size_t nextOutputChunkEnd = nextOutputChunkStart;
1125 
1126                 while (outputFileRemainingLines != 0 && !remainingInputChunk.empty)
1127                 {
1128                     /* Note: newLineIndex is relative to 'remainingInputChunk', not
1129                      * 'inputChunk'. Updates to variables referring to 'inputChunk'
1130                      * need to reflect this. In particular, 'nextOutputChunkEnd'.
1131                      */
1132                     immutable newlineIndex = nextNewlineIndex(remainingInputChunk);
1133 
1134                     if (newlineIndex == -1)
1135                     {
1136                         nextOutputChunkEnd = inputChunk.length;
1137                     }
1138                     else
1139                     {
1140                         --outputFileRemainingLines;
1141                         nextOutputChunkEnd += (newlineIndex + 1);
1142                     }
1143 
1144                     remainingInputChunk = inputChunk[nextOutputChunkEnd .. $];
1145                 }
1146 
1147                 assert(nextOutputChunkStart < nextOutputChunkEnd);
1148                 assert(nextOutputChunkEnd <= inputChunk.length);
1149 
1150                 outputFile.rawWrite(inputChunk[nextOutputChunkStart .. nextOutputChunkEnd]);
1151 
1152                 if (outputFileRemainingLines == 0)
1153                 {
1154                     outputFile.close;
1155                     isOutputFileOpen = false;
1156                 }
1157 
1158                 nextOutputChunkStart = nextOutputChunkEnd;
1159 
1160                 assert(remainingInputChunk.length == inputChunk.length - nextOutputChunkStart);
1161             }
1162         }
1163     }
1164 }
1165 
1166 /* splitByLineCount unit tests.
1167  *
1168  * These tests are primarily for buffer management. There are edge cases involving the
1169  * interaction buffer size, input file size, lines-per-file, and newline placement
1170  * that are difficult to test against the executable.
1171  */
1172 unittest
1173 {
1174     import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
1175     import std.algorithm : min;
1176     import std.array : appender;
1177     import std.conv : to;
1178     import std.file : exists, mkdir, rmdirRecurse;
1179     import std.path : buildPath;
1180     import std.process : escapeShellCommand, executeShell;
1181 
1182     /* Test setup
1183      *
1184      * A set of twenty file input files is created, with names: input_NxM.txt, where
1185      * N is the number of characters in each row and M is the number of rows (lines).
1186      * The resulting files are put in the "lc_input" directory ('inputDir' variable)
1187      * and have names:
1188      *    input_0x2.txt, input_0x3.txt, ... input_5x5.txt.
1189      *
1190      * A standalone block of code produces the expected result files for splitting an
1191      * input file into a set of output files. This duplicates the splitByLineCount
1192      * output. This is done for lines-per-file counts 1 to 5. Each result set is place
1193      * ina subdirectory under "lc_expected" ('expectedDir' variable). Subdirectories
1194      * have names like: "0x2_by_1", "0x3_by_1", ..., "5x5_by_4".
1195      *
1196      * splitByLine is called for all the same input files and lines-per-file settings used
1197      * to produce the expected output. This is done via testSplitByLineCount, which calls
1198      * command line argument processing and splitByLine, similar to how the main program
1199      * works. The results are written to a subdirectory. The subdirectory is compared to
1200      * the expected output directory using the system 'diff' command.
1201      *
1202      * splitByLine is multiple times for each expected output case. The different calls
1203      * iterate over a series of small ReadBufferSizes. This is how tests for edge cases
1204      * in the readBufferSize vs line lengths, newline placement, etc., is accomplished.
1205      *
1206      * Note: One way to understand what is going on is to comment out the line:
1207      *
1208      *    scope(exit) testDir.rmdirRecurse;
1209      *
1210      * Then run the test (e.g. 'make test') and look at the directory structure left
1211      * behind. Print out the 'testDir' directory to see where it is located.
1212      */
1213 
1214     /* testSplitByLineCount acts as a surrogate for main() and tsvSplit(). It makes the
1215      * call to splitByLineCount and calls 'diff' to compare the output directory to the
1216      * expected directory. An assert is thrown if the directories do not match.
1217      */
1218     static void testSplitByLineCount(string[] cmdArgs, string expectedDir,
1219                                  size_t readBufferSize = 1024L * 512L)
1220     {
1221         import std.array : appender;
1222 
1223         assert(cmdArgs.length > 0, "[testSplitByLineCount] cmdArgs must not be empty.");
1224 
1225         auto formatAssertMessage(T...)(string msg, T formatArgs)
1226         {
1227             auto formatString = "[testSplitByLineCount] %s: " ~ msg;
1228             return format(formatString, cmdArgs[0], formatArgs);
1229         }
1230 
1231         TsvSplitOptions cmdopt;
1232         auto savedCmdArgs = cmdArgs.to!string;
1233         auto r = cmdopt.processArgs(cmdArgs);
1234         assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
1235         assert(cmdopt.linesPerFile != 0, "[testSplitByLineCount] --lines-per-file is required.");
1236         assert(!cmdopt.dir.empty, "[testSplitByLineCount] --dir is required.");
1237 
1238         splitByLineCount(cmdopt, readBufferSize);
1239 
1240         /* Diff command setup. */
1241         auto diffCmdArgs = ["diff", expectedDir, cmdopt.dir];
1242         auto diffResult = executeShell(escapeShellCommand(diffCmdArgs));
1243         assert(diffResult.status == 0,
1244                format("[testSplitByLineCount]\n  cmd: %s\n  readBufferSize: %d\n  expectedDir: %s\n------ Diff ------%s\n-------",
1245                       savedCmdArgs, readBufferSize, expectedDir, diffResult.output));
1246     }
1247 
1248     auto testDir = makeUnittestTempDir("tsv_split_bylinecount");
1249     scope(exit) testDir.rmdirRecurse;
1250 
1251     auto inputDir = buildPath(testDir, "lc_input");
1252     auto outputDir = buildPath(testDir, "lc_output");
1253     auto expectedDir = buildPath(testDir, "lc_expected");
1254 
1255     mkdir(inputDir);
1256     mkdir(outputDir);
1257     mkdir(expectedDir);
1258 
1259     static string buildInputFilePath(string dir, long inputLineLength, long inputFileNumLines)
1260     {
1261         return buildPath(dir, format("input_%dx%d.txt", inputLineLength, inputFileNumLines));
1262     }
1263 
1264     string[5] outputRowData =
1265         [
1266             "abcde",
1267             "fghij",
1268             "klmno",
1269             "pqrst",
1270             "uvwxy"
1271         ];
1272 
1273     /* The main test loop. Iterates over input line lengths, numbers of rows,
1274      * lines-per-file, and finally readBufferSize lengths. All combos are tested.
1275      */
1276     foreach (inputLineLength; 0 .. 6)
1277     {
1278         foreach (inputFileNumLines; 2 .. 6)
1279         {
1280             auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);
1281 
1282             {
1283                 auto ofile = inputFile.File("wb");
1284                 auto output = appender!(char[])();
1285                 foreach (m; 0 .. inputFileNumLines)
1286                 {
1287                     put(output, outputRowData[m][0 .. inputLineLength]);
1288                     put(output, '\n');
1289                 }
1290                 ofile.write(output.data);
1291                 ofile.close;
1292             }
1293 
1294             /* Iterate over the different lines-per-file lengths.
1295              * - Create an expected output directory and files for each.
1296              * - Test with different readBufferSize values.
1297              */
1298             foreach (outputFileNumLines; 1 .. min(5, inputFileNumLines))
1299             {
1300                 auto expectedSubDir =
1301                     buildPath(expectedDir, format("%dx%d_by_%d", inputLineLength,
1302                                                   inputFileNumLines, outputFileNumLines));
1303                 mkdir(expectedSubDir);
1304 
1305                 size_t filenum = 0;
1306                 size_t linesWritten = 0;
1307                 while (linesWritten < inputFileNumLines)
1308                 {
1309                     auto expectedFile = buildPath(expectedSubDir, format("part_%d.txt", filenum));
1310                     auto f = expectedFile.File("wb");
1311                     auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);
1312                     foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
1313                     {
1314                         f.writeln(line[0 .. inputLineLength]);
1315                     }
1316                     linesWritten += linesToWrite;
1317                     ++filenum;
1318                     f.close;
1319                 }
1320 
1321                 /* Test the different readBufferSizes.
1322                  * - An output directory is created for the run and deleted afterward.
1323                  * - First test the default size.
1324                  * - Then iterate overs small readBufferSize values.
1325                  */
1326                 auto outputSubDir =
1327                     buildPath(outputDir, format("%dx%d_by_%d", inputLineLength,
1328                                                 inputFileNumLines, outputFileNumLines));
1329                 mkdir(outputSubDir);
1330 
1331                 testSplitByLineCount(
1332                     ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
1333                      "--digit-width", "1", inputFile],
1334                     expectedSubDir);
1335 
1336                 outputSubDir.rmdirRecurse;
1337 
1338                 foreach (readBufSize; 1 .. 8)
1339                 {
1340                      mkdir(outputSubDir);
1341 
1342                      testSplitByLineCount(
1343                          ["test", "--lines-per-file", outputFileNumLines.to!string, "--dir", outputSubDir,
1344                           "--digit-width", "1", inputFile],
1345                          expectedSubDir, readBufSize);
1346 
1347                      outputSubDir.rmdirRecurse;
1348                 }
1349             }
1350         }
1351     }
1352 
1353     {
1354         /* Tests for the special case where readBufferSize is smaller than the header
1355          * line. We'll reuse the input_5x4.txt input file and write 1 line-per-file.
1356          */
1357         immutable inputLineLength = 5;
1358         immutable inputFileNumLines = 4;
1359         immutable outputFileNumLines = 1;
1360 
1361         auto inputFile = buildInputFilePath(inputDir, inputLineLength, inputFileNumLines);
1362         assert(inputFile.exists);
1363 
1364         auto expectedSubDirHeader =
1365             buildPath(expectedDir, format("%dx%d_by_%d_header", inputLineLength,
1366                                           inputFileNumLines, outputFileNumLines));
1367 
1368         auto expectedSubDirHeaderInOnly =
1369             buildPath(expectedDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
1370                                           inputFileNumLines, outputFileNumLines));
1371 
1372         mkdir(expectedSubDirHeader);
1373         mkdir(expectedSubDirHeaderInOnly);
1374 
1375         /* Generate the expected results. Cheat by starting with linesWritten = 1. This
1376          * automatically excludes the header line, but keeps the loop code consistent
1377          * with the main test loop.
1378          */
1379         size_t filenum = 0;
1380         size_t linesWritten = 1;
1381         while (linesWritten < inputFileNumLines)
1382         {
1383             auto expectedFileHeader = buildPath(expectedSubDirHeader, format("part_%d.txt", filenum));
1384             auto expectedFileHeaderInOnly = buildPath(expectedSubDirHeaderInOnly,
1385                                                       format("part_%d.txt", filenum));
1386             auto fHeader = expectedFileHeader.File("wb");
1387             auto fHeaderInOnly = expectedFileHeaderInOnly.File("wb");
1388             auto linesToWrite = min(outputFileNumLines, inputFileNumLines - linesWritten);
1389 
1390             fHeader.writeln(outputRowData[0][0 .. inputLineLength]);
1391             foreach (line; outputRowData[linesWritten .. linesWritten + linesToWrite])
1392             {
1393                 fHeader.writeln(line[0 .. inputLineLength]);
1394                 fHeaderInOnly.writeln(line[0 .. inputLineLength]);
1395             }
1396             linesWritten += linesToWrite;
1397             ++filenum;
1398             fHeader.close;
1399             fHeaderInOnly.close;
1400         }
1401 
1402         /* Now run the tests. */
1403         auto outputSubDirHeader =
1404             buildPath(outputDir, format("%dx%d_by_%d_header", inputLineLength,
1405                                         inputFileNumLines, outputFileNumLines));
1406         auto outputSubDirHeaderInOnly =
1407             buildPath(outputDir, format("%dx%d_by_%d_header_in_only", inputLineLength,
1408                                         inputFileNumLines, outputFileNumLines));
1409 
1410         foreach (readBufSize; 1 .. 6)
1411         {
1412             mkdir(outputSubDirHeader);
1413             mkdir(outputSubDirHeaderInOnly);
1414 
1415             testSplitByLineCount(
1416                 ["test", "--header", "--lines-per-file", outputFileNumLines.to!string,
1417                  "--dir", outputSubDirHeader, "--digit-width", "1", inputFile],
1418                 expectedSubDirHeader, readBufSize);
1419 
1420             testSplitByLineCount(
1421                 ["test", "--header-in-only", "--lines-per-file", outputFileNumLines.to!string,
1422                  "--dir", outputSubDirHeaderInOnly, "--digit-width", "1", inputFile],
1423                 expectedSubDirHeaderInOnly, readBufSize);
1424 
1425             outputSubDirHeader.rmdirRecurse;
1426             outputSubDirHeaderInOnly.rmdirRecurse;
1427         }
1428     }
1429 }