1 /**
2 A variant of the unix 'cut' program, with the ability to reorder fields.
3 
4 tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
5 fields. Lines are read from files or standard input and split on a delimiter character.
6 Fields are written to standard output in the order listed. Fields can be listed more
7 than once, and fields not listed can be written out as a group.
8 
9 This program is intended both as a useful utility and a D programming language example.
10 Functionality and constructs used include command line argument processing, file I/O,
11 exception handling, ranges, tuples and strings, templates, universal function call syntax
12 (UFCS), lambdas and functional programming constructs. Comments are more verbose than
13 typical to shed light on D programming constructs, but not to the level of a tutorial.
14 
15 Copyright (c) 2015-2021, eBay Inc.
16 Initially written by Jon Degenhardt
17 
18 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
19 */
20 
21 module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.
22 
// Imports used by multiple routines. Other imports are made in local context.
24 import std.exception : enforce;
25 import std.range;
26 import std.stdio;
27 import std.typecons : tuple, Tuple;
28 
29 // 'Heredoc' style help text. When printed it is followed by a getopt formatted option list.
immutable helpText = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Ranges
can be used, and wildcards can be used when specifying fields by name.

Fields can be dropped using '--e|exclude'. Fields not included in the
'--f|fields' option can be selected as a group using '--r|rest'.

Examples:

   # Selecting fields. Output is in the order listed
   tsv-select -H -f date,time file.tsv
   tsv-select -f 2,1 file.tsv
   tsv-select -f 5-7,2,9-11
   tsv-select -H -f '*_date' file.tsv

   # Dropping fields
   tsv-select --exclude 1 file.tsv
   tsv-select -H -e date,time file.tsv

   # Move fields to the front or the back
   tsv-select -f 1 --rest first file.tsv  # Move field 1 to the end
   tsv-select -H -f date --rest last      # Move 'date' field to the front

   # Read multiple files, keep the header from only the first
   tsv-select data*.tsv -H --fields 1,2,4-7,14

Use '--help-verbose' for detailed information. Use '--help-fields' for
details about field lists and field names.

Options:
EOS";
71 
immutable helpTextVerbose = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Fields
can be repeated and ranges can be used. Wildcards can be used when
specifying fields by name, and escapes can be used to specify field names
containing special characters. Run '--help-fields' for details.

Fields can be excluded using '--e|exclude'. All fields not excluded are
output. Fields not included in the '--f|fields' option can be selected as
a group using '--r|rest'. '--f|fields' and '--r|rest' can be used with
'--e|exclude' to reorder non-excluded fields.

Examples:

   # Keep the first field from two files
   tsv-select -f 1 file1.tsv file2.tsv

   # Keep fields 1 and 2, retaining the header from only the first file
   tsv-select -H -f 1,2 file1.tsv file2.tsv

   # Keep the 'time' field
   tsv-select -H -f time file1.tsv

   # Keep all fields ending '_date' or '_time'
   tsv-select -H -f '*_date,*_time' file.tsv

   # Drop all the '*_time' fields
   tsv-select -H --exclude '*_time' file.tsv

   # Field reordering and field ranges
   tsv-select -f 3,2,1 file.tsv
   tsv-select -f 1,4-7,11 file.tsv
   tsv-select -f 1,7-4,11 file.tsv

   # Repeating fields
   tsv-select -f 1,2,1 file.tsv
   tsv-select -f 1-3,3-1 file.tsv

   # Move fields to the front
   tsv-select -f 5 --rest last file.tsv
   tsv-select -H -f Date,Time --rest last file.tsv

   # Move fields to the end
   tsv-select -f 4,5 --rest first file.tsv
   tsv-select -H -f '*_time' --rest first file.tsv

   # Move field 2 to the front and drop fields 10-15
   tsv-select -f 2 -e 10-15 file.tsv

   # Move field 2 to the end, dropping fields 10-15
   tsv-select -f 2 --rest first -e 10-15 file.tsv

Use '--help-fields' for detailed help on field lists.

Notes:
* One of '--f|fields' or '--e|exclude' is required.
* Fields specified by '--f|fields' and '--e|exclude' cannot overlap.
* When '--f|fields' and '--e|exclude' are used together, the effect is to
  specify '--rest last'. This can be overridden by using '--rest first'.
* Each input line must be long enough to contain all fields specified
  with '--f|fields'. This is not necessary for '--e|exclude' fields.
* Specifying names of fields containing special characters may require
  escaping the special characters. See '--help-fields' for details.
* Output is buffered by default to improve performance. Use
  '--line-buffered' to have each line immediately written out.

Options:
EOS";
149 
/** Container for command line options.
 *
 * The members are filled in by processArgs. Some hold values taken directly from
 * the command line, others ('fields', 'excludedFieldsTable', 'inputSources') are
 * derived from them.
 */
struct TsvSelectOptions
{
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange, LineBuffered,
        ReadHeader;

    // The allowed values for the --rest option.
    enum RestOption { none, first, last }

    string programName;                 /// Program name
    ByLineSourceRange!() inputSources;  /// Input Files
    bool hasHeader = false;             /// --H|header
    char delim = '\t';                  /// --d|delimiter
    bool lineBuffered = false;          /// --line-buffered
    RestOption restArg;                 /// --rest first|last (none is hidden default)
    size_t[] fields;                    /// Derived from --f|fields (zero-based indices)
    bool[] excludedFieldsTable;         /// Derived. Lookup table for excluded fields, indexed by zero-based field index.

    /** Process command line arguments (getopt cover).
     *
     * processArgs calls getopt to process command line arguments. It does any additional
     * validation and parameter derivations needed. A tuple is returned. First value is
     * true if command line arguments were successfully processed and execution should
     * continue, or false if an error occurred or the user asked for help. If false, the
     * second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Params:
     *   cmdArgs = The program's command line arguments. Options are consumed by getopt
     *             and file arguments are moved into 'inputSources'; on the non-help,
     *             non-error path the array is truncated to its first element.
     *
     * Returns: tuple(continueExecution, exitCode).
     *
     * Exceptions raised during processing are caught here, reported on stderr, and
     * turned into a tuple(false, 1) result.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : maxElement;
        import std.array : split;
        import std.conv : to;
        import std.format : format;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;
        import tsv_utils.common.utils : throwIfWindowsNewline;

        bool helpVerbose = false;           // --help-verbose
        bool helpFields = false;            // --help-fields
        bool versionWanted = false;         // --V|version
        string fieldsArg;                   // --f|fields
        string excludedFieldsArg;           // --e|exclude

        // Option strings are named so the same text is used for getopt and for
        // the option name passed to parseFieldList.
        string fieldsOptionString = "f|fields";
        string excludedFieldsOptionString = "e|exclude";

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",
                "              Print more detailed help.",
                &helpVerbose,

                "help-fields",
                "              Print help on specifying fields.",
                &helpFields,

                std.getopt.config.caseSensitive,
                "H|header",
                "              Treat the first line of each file as a header.",
                &hasHeader,
                std.getopt.config.caseInsensitive,

                fieldsOptionString,
                "<field-list>  Fields to retain. Fields are output in the order listed.",
                &fieldsArg,

                excludedFieldsOptionString,
                "<field-list>  Fields to exclude.",
                &excludedFieldsArg,

                "r|rest",
                "first|last    Output location for fields not included in '--f|fields'.",
                &restArg,

                "d|delimiter",
                "CHR           Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)",
                &delim,

                "line-buffered",
                "              Immediately output every line.",
                &lineBuffered,

                std.getopt.config.caseSensitive,
                "V|version",
                "              Print version information and exit.",
                &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            // Help and version requests print their text and stop with exit code 0.
            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /* Remaining command line args are files. Use standard input if files
             * were not provided. Truncate cmdArgs to consume the arguments.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             *
             * Note: fields and excludedFields depend on header line processing, but
             * fieldsArg and excludedFieldsArg can be used to detect whether the
             * command line argument was specified.
             */

            enforce(!fieldsArg.empty || !excludedFieldsArg.empty,
                    "One of '--f|fields' or '--e|exclude' is required.");

            string[] headerFields;

            /* fieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             *
             * It reads fieldsArg, excludedFieldsArg and headerFields from the enclosing
             * scope and sets the 'fields', 'excludedFieldsTable' and 'restArg' members.
             */
            void fieldListArgProcessing()
            {
                if (!fieldsArg.empty)
                {
                    fields = fieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, fieldsOptionString)
                        .array;
                }

                size_t[] excludedFields;

                if (!excludedFieldsArg.empty)
                {
                    excludedFields = excludedFieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, excludedFieldsOptionString)
                        .array;
                }

                if (excludedFields.length > 0)
                {
                    /* Make sure selected and excluded fields do not overlap. */
                    foreach (e; excludedFields)
                    {
                        foreach (f; fields)
                        {
                            enforce(e != f, "'--f|fields' and '--e|exclude' have overlapping fields.");
                        }
                    }

                    /* '--exclude' changes '--rest' default to 'last'. */
                    if (restArg == RestOption.none) restArg = RestOption.last;

                    /* Build the excluded field lookup table.
                     *
                     * Note: Users won't have any reason to expect memory is allocated based
                     * on the max field number. However, users might pick arbitrarily large
                     * numbers when trimming fields. So, limit the max field number to something
                     * big but reasonable (more than 1 million). The limit can be raised if use
                     * cases arise.
                     *
                     * maxExcludedField is a zero-based index, so the strict '<' comparison
                     * permits one-based field numbers up to and including the limit.
                     */
                    immutable size_t maxExcludedField = excludedFields.maxElement;
                    immutable size_t maxAllowedExcludedField = 1024 * 1024;

                    enforce(maxExcludedField < maxAllowedExcludedField,
                            format("Maximum allowed '--e|exclude' field number is %d.",
                                   maxAllowedExcludedField));

                    excludedFieldsTable.length = maxExcludedField + 1;          // Initialized to false
                    foreach (e; excludedFields) excludedFieldsTable[e] = true;
                }
            }

            if (!hasHeader) fieldListArgProcessing();

            /*
             * Create the byLineSourceRange and perform header line processing.
             */
            immutable LineBuffered isLineBuffered = lineBuffered ? Yes.lineBuffered : No.lineBuffered;
            immutable ReadHeader useReadHeader = hasHeader ? Yes.readHeader : No.readHeader;
            inputSources = byLineSourceRange(filepaths, isLineBuffered, useReadHeader);

            if (hasHeader)
            {
                /* The header is taken from the first line of the first input source.
                 * If that source has no lines, headerFields stays empty.
                 */
                if (!inputSources.front.byLine.empty)
                {
                    throwIfWindowsNewline(inputSources.front.byLine.front, inputSources.front.name, 1);
                    headerFields = inputSources.front.byLine.front.split(delim).to!(string[]);
                }

                fieldListArgProcessing();
            }

        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
377 
/* D runtime configuration: "gcopt=cleanup:none" asks the garbage collector not to
 * run a cleanup pass when the program terminates. The setting is only declared
 * for compiler front-end versions 2.085 and later.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Program entry point.
 *
 * Processes the command line, then runs the tsvSelect instantiation matching the
 * '--rest' setting. Returns the process exit code: the code chosen by processArgs
 * when it asks to stop (help, version, or argument error), 1 if tsvSelect throws,
 * and 0 otherwise.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const argsResult = cmdopt.processArgs(cmdArgs);

    // argsResult[0]: continue execution? argsResult[1]: exit code to use if not.
    if (!argsResult[0])
    {
        return argsResult[1];
    }

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    alias Rest = TsvSelectOptions.RestOption;
    int exitCode = 0;

    try
    {
        /* Dispatch to the tsvSelect instantiation for the chosen --rest option.
         * By this point processArgs has consumed the option arguments and moved
         * the input files into cmdopt.
         */
        final switch (cmdopt.restArg)
        {
        case Rest.none:
            tsvSelect!(RestLocation.none)(cmdopt);
            break;
        case Rest.first:
            tsvSelect!(RestLocation.first)(cmdopt);
            break;
        case Rest.last:
            tsvSelect!(RestLocation.last)(cmdopt);
            break;
        }
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        exitCode = 1;
    }

    return exitCode;
}
426 
427 // tsvSelect
428 
/** Selects which specialization of the tsvSelect template is instantiated.
 *
 * The members mirror TsvSelectOptions.RestOption one for one. main maps the
 * command line value to the matching member here when it picks the tsvSelect
 * instantiation to call. The two enums are kept separate so that this one is
 * independent of the end-user UI: the TsvSelectOptions version defines the text
 * accepted as command line argument values.
 */
enum RestLocation
{
    none,
    first,
    last
}
437 
/** tsvSelect performs the main work of the tsv-select program.
 *
 * Each input line is split on the delimiter, the requested fields are picked out,
 * and the line is written to standard output in the requested order. Fields that
 * were neither selected nor excluded are written as a group before or after the
 * selected fields when a '--rest' location is in effect.
 *
 * The function is a template on the '--rest' setting so the choice is made at
 * compile time rather than by if-tests in the inner loop. main instantiates it
 * once per setting. This gives a larger program, but a faster one: the original
 * author measured run-time improvements of 25% over a non-templatized version.
 *
 * Params:
 *   rest   = Template argument. Must correspond to cmdopt.restArg.
 *   cmdopt = Processed command line options, including the input sources.
 *
 * Throws: Exception if a line has too few fields for the '--f|fields' list.
 */
void tsvSelect(RestLocation rest)(ref TsvSelectOptions cmdopt)
{
    import std.algorithm: splitter;
    import std.array : appender, Appender;
    import std.format: format;
    import std.range;
    import tsv_utils.common.utils: BufferedOutputRange,
        ByLineSourceRange, InputFieldReordering, LineBuffered, throwIfWindowsNewline;

    alias RestOption = TsvSelectOptions.RestOption;

    /* True for the instantiations that collect non-selected fields. */
    enum bool hasRest = (rest != RestLocation.none);

    // Ensure the correct template instantiation was called.
    static if (rest == RestLocation.none)
        enum expectedRestArg = RestOption.none;
    else static if (rest == RestLocation.first)
        enum expectedRestArg = RestOption.first;
    else static if (rest == RestLocation.last)
        enum expectedRestArg = RestOption.last;
    else
        static assert(false, "rest template argument does not match cmdopt.restArg.");

    assert(cmdopt.restArg == expectedRestArg);

    /* Sanity checks on the input setup: there is at least one input source, and
     * the source range type is the one that strips line terminators.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* The algorithm assumes an excluded-field table ('--e|exclude') is never
     * combined with RestLocation.none.
     */
    assert(cmdopt.excludedFieldsTable.length == 0 || hasRest);

    /* Collects the fields named in cmdopt.fields from each line and exposes them
     * in output order.
     */
    auto reorderer = new InputFieldReordering!char(cmdopt.fields);

    /* Fields that are not on the --fields list go into a separate buffer so they
     * can be written as a group (the --rest option). The same appender is cleared
     * and reused for every line.
     */
    static if (hasRest)
    {
        auto restFields = appender!(char[][]);
    }

    /* Output goes through BufferedOutputRange (from common/utils.d) rather than
     * directly to stdout.
     */
    immutable LineBuffered bufferingMode = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;
    auto output = BufferedOutputRange!(typeof(stdout))(stdout, bufferingMode);

    /* Walk every input source (files or stdin), line by line. Line numbers are
     * one-based, file indices are zero-based.
     */
    foreach (fileIndex, source; cmdopt.inputSources.enumerate)
    {
        foreach (lineNumber, line; source.byLine.enumerate(1))
        {
            if (lineNumber == 1)
            {
                throwIfWindowsNewline(line, source.name, lineNumber);

                // Drop the header line from all but the first file.
                if (fileIndex > 0 && cmdopt.hasHeader) continue;
            }

            static if (hasRest)
            {
                restFields.clear;

                /* Offset in the line just past the delimiter that follows the field
                 * currently being processed. Used to bulk-append the remainder of
                 * the line once nothing further needs to be examined.
                 */
                size_t restStart = 0;
            }

            reorderer.initNewLine;

            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                static if (!hasRest)
                {
                    // Only selected fields matter; stop as soon as all are found.
                    reorderer.processNextField(fieldIndex, fieldValue);
                    if (reorderer.allFieldsFilled) break;
                }
                else
                {
                    /* Field value plus its one-byte delimiter. */
                    restStart += fieldValue.length + 1;

                    /* The table covers indices up to the largest excluded field, so
                     * exclusions can only remain while fieldIndex is inside it.
                     */
                    immutable tableLength = cmdopt.excludedFieldsTable.length;
                    bool exclusionsPending = fieldIndex < tableLength;

                    if (exclusionsPending && cmdopt.excludedFieldsTable[fieldIndex])
                    {
                        /* Excluded field: not output. If this is the final table
                         * entry, no more exclusions follow on this line.
                         */
                        if (fieldIndex + 1 == tableLength) exclusionsPending = false;
                    }
                    else if (reorderer.processNextField(fieldIndex, fieldValue) == 0)
                    {
                        // Not a selected field, so it belongs to the 'rest' group.
                        restFields.put(fieldValue);
                    }

                    if (reorderer.allFieldsFilled && !exclusionsPending)
                    {
                        /* Every selected field is filled and no exclusions remain, so
                         * the remainder of the line is 'rest' and is appended as one
                         * piece. restStart exceeds line.length only when the current
                         * field is the last one on the line, in which case nothing
                         * remains. When it equals line.length the line ends with a
                         * delimiter and an empty trailing field is appended.
                         */
                        if (restStart <= line.length)
                        {
                            restFields.put(line[restStart .. $]);
                        }

                        break;
                    }
                }
            }

            // All fields on the line have been seen; every selected field must be filled.
            enforce(reorderer.allFieldsFilled,
                    format("Not enough fields in line. File: %s,  Line: %s",
                           source.name, lineNumber));

            /* Write the line: 'rest' group first (if requested), then the selected
             * fields, then the 'rest' group last (if requested). A delimiter joins
             * the two groups only when both are non-empty.
             */
            static if (rest == RestLocation.first)
            {
                if (restFields.data.length > 0)
                {
                    output.joinAppend(restFields.data, cmdopt.delim);
                    if (cmdopt.fields.length > 0) output.append(cmdopt.delim);
                }
            }

            output.joinAppend(reorderer.outputFields, cmdopt.delim);

            static if (rest == RestLocation.last)
            {
                if (restFields.data.length > 0)
                {
                    if (cmdopt.fields.length > 0) output.append(cmdopt.delim);
                    output.joinAppend(restFields.data, cmdopt.delim);
                }
            }

            output.appendln;

            /* Flush after the first line of the first file so it is sent right away.
             * This helps detect errors quickly in multi-stage unix pipelines, where
             * an upstream process may have sent only one line so far, usually a
             * header line.
             */
            if (lineNumber == 1 && fileIndex == 0) output.flush;
        }
    }
}