1 /**
2 A variant of the unix 'cut' program, with the ability to reorder fields.
3 
4 tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
5 fields. Lines are read from files or standard input and split on a delimiter character.
6 Fields are written to standard output in the order listed. Fields can be listed more
7 than once, and fields not listed can be written out as a group.
8 
9 This program is intended both as a useful utility and a D programming language example.
10 Functionality and constructs used include command line argument processing, file I/O,
11 exception handling, ranges, tuples and strings, templates, universal function call syntax
12 (UFCS), lambdas and functional programming constructs. Comments are more verbose than
13 typical to shed light on D programming constructs, but not to the level of a tutorial.
14 
15 Copyright (c) 2015-2020, eBay Inc.
16 Initially written by Jon Degenhardt
17 
License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
19 */
20 
21 module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.
22 
// Imports used by multiple routines. Other imports are made in local context.
24 import std.exception : enforce;
25 import std.range;
26 import std.stdio;
27 import std.typecons : tuple, Tuple;
28 
/* Brief help text, written as a 'heredoc' style delimited string. When printed it is
 * followed by a getopt formatted option list.
 *
 * Every example must be a valid command line: selecting fields (by number or by name)
 * always requires '-f|--fields', and selection by name additionally requires '-H'.
 */
immutable helpText = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Ranges
can be used, and wildcards can be used when specifying fields by name.

Fields can be dropped using '--e|exclude'. Fields not included in the
'--f|fields' option can be selected as a group using '--r|rest'.

Examples:

   # Selecting fields. Output is in the order listed
   tsv-select -H -f date,time file.tsv
   tsv-select -f 2,1 file.tsv
   tsv-select -f 5-7,2,9-11
   tsv-select -H -f '*_date' file.tsv

   # Dropping fields
   tsv-select --exclude 1 file.tsv
   tsv-select -H -e date,time file.tsv

   # Move fields to the front or the back
   tsv-select -f 1 --rest first file.tsv  # Move field 1 to the end
   tsv-select -H -f date --rest last      # Move 'date' field to the front

   # Read multiple files, keep the header from only the first
   tsv-select data*.tsv -H --fields 1,2,4-7,14

Use '--help-verbose' for detailed information. Use '--help-fields' for
details about field lists and field names.

Options:
EOS";
71 
/* Detailed help text ('--help-verbose'). Like the brief help text, it is printed ahead
 * of the getopt formatted option list.
 *
 * Examples are meant to be runnable as shown: long options use two dashes ('--rest'),
 * and examples selecting fields by name or wildcard include '-H'.
 */
immutable helpTextVerbose = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Fields
can be repeated and ranges can be used. Wildcards can be used when
specifying fields by name, and escapes can be used to specify fields names
containing special characters. Run '--help-fields' for details.

Fields can be excluded using '--e|exclude'. All fields not excluded are
output. Fields not included in the '--f|fields' option can be selected as
a group using '--r|rest'. '--f|fields' and '--r|rest' can be used with
'--e|exclude' to reorder non-excluded fields.

Examples:

   # Keep the first field from two files
   tsv-select -f 1 file1.tsv file2.tsv

   # Keep fields 1 and 2, retaining the header from only the first file
   tsv-select -H -f 1,2 file1.tsv file2.tsv

   # Keep the 'time' field
   tsv-select -H -f time file1.tsv

   # Keep all fields ending '_date' or '_time'
   tsv-select -H -f '*_date,*_time' file.tsv

   # Drop all the '*_time' fields
   tsv-select -H --exclude '*_time' file.tsv

   # Field reordering and field ranges
   tsv-select -f 3,2,1 file.tsv
   tsv-select -f 1,4-7,11 file.tsv
   tsv-select -f 1,7-4,11 file.tsv

   # Repeating fields
   tsv-select -f 1,2,1 file.tsv
   tsv-select -f 1-3,3-1 file.tsv

   # Move fields to the front
   tsv-select -f 5 --rest last file.tsv
   tsv-select -H -f Date,Time --rest last file.tsv

   # Move fields to the end
   tsv-select -f 4,5 --rest first file.tsv
   tsv-select -H -f '*_time' --rest first file.tsv

   # Move field 2 to the front and drop fields 10-15
   tsv-select -f 2 -e 10-15 file.tsv

   # Move field 2 to the end, dropping fields 10-15
   tsv-select -f 2 --rest first -e 10-15 file.tsv

Use '--help-fields' for detailed help on field lists.

Notes:
* One of '--f|fields' or '--e|exclude' is required.
* Fields specified by '--f|fields' and '--e|exclude' cannot overlap.
* When '--f|fields' and '--e|exclude' are used together, the effect is to
  specify '--rest last'. This can be overridden by using '--rest first'.
* Each input line must be long enough to contain all fields specified
  with '--f|fields'. This is not necessary for '--e|exclude' fields.
* Specifying names of fields containing special characters may require
  escaping the special characters. See '--help-fields' for details.

Options:
EOS";
147 
/** Holds the tsv-select command line settings along with the values derived from them.
 */
struct TsvSelectOptions
{
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    // Values accepted by the --rest option.
    enum RestOption { none, first, last }

    string programName;                 /// Program name
    ByLineSourceRange!() inputSources;  /// Input Files
    bool hasHeader = false;             /// --H|header
    char delim = '\t';                  /// --d|delimiter
    RestOption restArg;                 /// --rest first|last (none is hidden default)
    size_t[] fields;                    /// Derived from --f|fields
    bool[] excludedFieldsTable;         /// Derived. Lookup table for excluded fields.

    /** Parse and validate the command line (a cover for getopt).
     *
     * getopt is run over cmdArgs, after which validation is performed and derived
     * members are filled in. The result is a (bool, int) tuple:
     *   - (true, 0): arguments were accepted and the program should continue.
     *   - (false, code): stop and exit with 'code'. The code is 0 when help or version
     *     output was requested and 1 when an error was reported on stderr.
     *
     * On success, field numbers have been converted to zero-based indices, cmdArgs has
     * been truncated to the program name only, and inputSources has been opened on the
     * files named on the command line (or standard input if there were none).
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : maxElement;
        import std.array : split;
        import std.conv : to;
        import std.format : format;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;
        import tsv_utils.common.utils : throwIfWindowsNewline;

        // Option values used only while processing the command line.
        bool helpVerbose = false;           // --help-verbose
        bool helpFields = false;            // --help-fields
        bool versionWanted = false;         // --V|version
        string fieldsArg;                   // --f|fields
        string excludedFieldsArg;           // --e|exclude

        // Option names are held in variables as they are also passed to parseFieldList.
        string fieldsOptionString = "f|fields";
        string excludedFieldsOptionString = "e|exclude";

        programName = "Unknown_program_name";
        if (cmdArgs.length > 0) programName = cmdArgs[0].stripExtension.baseName;

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options

            /* The case sensitivity configuration is switched on only around the
             * '-H' and '-V' options, then restored to case-insensitive.
             */
            auto getoptResult = getopt(
                cmdArgs,

                "help-verbose", "              Print more detailed help.", &helpVerbose,

                "help-fields", "              Print help on specifying fields.", &helpFields,

                std.getopt.config.caseSensitive,
                "H|header", "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                fieldsOptionString,
                "<field-list>  Fields to retain. Fields are output in the order listed.",
                &fieldsArg,

                excludedFieldsOptionString, "<field-list>  Fields to exclude.", &excludedFieldsArg,

                "r|rest", "first|last    Output location for fields not included in '--f|fields'.", &restArg,

                "d|delimiter",
                "CHR           Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)",
                &delim,

                std.getopt.config.caseSensitive,
                "V|version", "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            /* Informational requests: print and stop with a zero exit code. Checked in
             * priority order: --help, --help-verbose, --help-fields, --version.
             */
            if (getoptResult.helpWanted)
            {
                defaultGetoptPrinter(helpText, getoptResult.options);
                return tuple(false, 0);
            }

            if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, getoptResult.options);
                return tuple(false, 0);
            }

            if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }

            if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /* Whatever getopt left behind, apart from the program name, is the list of
             * input files. Standard input ("-") is used when no files were given. The
             * arguments are consumed by truncating cmdArgs to the program name.
             */
            string[] filepaths = ["-"];
            if (cmdArgs.length > 1) filepaths = cmdArgs[1 .. $];
            cmdArgs.length = 1;

            /* Validation and derivations. As much as possible is checked before the
             * header line is read, so errors surface without waiting on stdin.
             *
             * The parsed field lists depend on header line processing, but the raw
             * option strings are enough to tell whether the options were given.
             */
            enforce(!fieldsArg.empty || !excludedFieldsArg.empty,
                    "One of '--f|fields' or '--e|exclude' is required.");

            string[] headerFields;

            /* Converts the '--fields' and '--exclude' arguments into their derived
             * forms. Runs before the header line is read when headers are not in use,
             * and after it when they are.
             */
            void deriveFieldLists()
            {
                if (!fieldsArg.empty)
                {
                    fields = fieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, fieldsOptionString)
                        .array;
                }

                size_t[] excludedFields;

                if (!excludedFieldsArg.empty)
                {
                    excludedFields = excludedFieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, excludedFieldsOptionString)
                        .array;
                }

                // Nothing further to derive when no fields are being excluded.
                if (excludedFields.empty) return;

                // A field cannot be both selected and excluded.
                foreach (selected; fields)
                {
                    foreach (excluded; excludedFields)
                    {
                        enforce(selected != excluded,
                                "'--f|fields' and '--e|exclude' have overlapping fields.");
                    }
                }

                // With '--exclude', the default for '--rest' becomes 'last'.
                if (restArg == RestOption.none) restArg = RestOption.last;

                /* Build the lookup table for excluded fields.
                 *
                 * The table is sized by the largest excluded field number, something
                 * users have no reason to expect. Since arbitrarily large numbers might
                 * be picked when trimming fields, the field number is capped at a value
                 * that is big but reasonable. The cap can be raised if use cases arise.
                 */
                enum size_t maxAllowedExcludedField = 1024 * 1024;
                immutable size_t highestExcludedField = excludedFields.maxElement;

                enforce(highestExcludedField < maxAllowedExcludedField,
                        format("Maximum allowed '--e|exclude' field number is %d.",
                               maxAllowedExcludedField));

                excludedFieldsTable.length = highestExcludedField + 1;   // New entries are false
                foreach (excluded; excludedFields) excludedFieldsTable[excluded] = true;
            }

            if (!hasHeader) deriveFieldLists();

            /* Open the input sources. In header mode, the header line of the first
             * source is read (when there is one) so fields can be resolved by name.
             */
            inputSources = byLineSourceRange(filepaths);

            if (hasHeader)
            {
                if (!inputSources.front.byLine.empty)
                {
                    throwIfWindowsNewline(inputSources.front.byLine.front, inputSources.front.name, 1);
                    headerFields = inputSources.front.byLine.front.split(delim).to!(string[]);
                }

                deriveFieldLists();
            }
        }
        catch (Exception e)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, e.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }
}
367 
/* Options embedded in the executable for the D runtime. The 'cleanup:none' garbage
 * collector setting is described in the D garbage collector configuration docs as
 * doing nothing with live objects at program termination. The declaration is only
 * compiled for compiler front-end versions 2.085 and later.
 */
static if (__VERSION__ >= 2085)
{
    extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
}
369 
/** Program entry point.
 *
 * Processes the command line, then runs the tsvSelect instantiation corresponding to
 * the '--rest' setting. Returns 0 on success. Otherwise returns the exit code produced
 * by command line processing, or 1 if tsvSelect throws an Exception.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    alias RestOption = TsvSelectOptions.RestOption;

    TsvSelectOptions cmdopt;

    // processArgs yields (continue?, exit code). Stop here on help, version, or error.
    const argsResult = cmdopt.processArgs(cmdArgs);
    immutable bool shouldContinue = argsResult[0];
    immutable int exitCode = argsResult[1];
    if (!shouldContinue) return exitCode;

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    try
    {
        /* Dispatch to the tsvSelect instantiation for the --rest setting. The input
         * sources were already set up in cmdopt by command line processing, so the
         * options struct is all tsvSelect needs.
         */
        final switch (cmdopt.restArg)
        {
        case RestOption.none:
            tsvSelect!(RestLocation.none)(cmdopt);
            break;
        case RestOption.first:
            tsvSelect!(RestLocation.first)(cmdopt);
            break;
        case RestOption.last:
            tsvSelect!(RestLocation.last)(cmdopt);
            break;
        }
    }
    catch (Exception e)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, e.msg);
        return 1;
    }

    return 0;
}
416 
417 // tsvSelect
418 
/** Selects which specialization of the tsvSelect template is instantiated.
 *
 * The members mirror those of the TsvSelectOptions.RestOption enum. main maps the
 * command line value onto this enum when choosing the tsvSelect instantiation to call.
 * The two enums are kept separate so this one is independent of the end-user UI: the
 * TsvSelectOptions enum defines the text of the values allowed on the command line.
 */
enum RestLocation
{
    none,
    first,
    last,
}
427 
/** Performs the main work of tsv-select.
 *
 * Each input line is split on the delimiter, the requested fields are extracted, and
 * the line is written to standard output with the fields in the order requested.
 * Fields that were neither selected nor excluded are optionally written as a group,
 * either before or after the selected fields. An exception is thrown on error.
 *
 * The '--rest' setting is a template parameter rather than a run-time value. This way
 * the tests on it are resolved at compile time instead of being repeated inside the
 * inner loop. main instantiates the function once per setting. The program is larger
 * as a result, but faster: the original author measured run-time improvements of 25%
 * relative to a non-templatized version.
 */
void tsvSelect(RestLocation rest)(ref TsvSelectOptions cmdopt)
{
    import tsv_utils.common.utils: BufferedOutputRange, ByLineSourceRange,
        InputFieldReordering, throwIfWindowsNewline;
    import std.algorithm: splitter;
    import std.array : appender, Appender;
    import std.format: format;
    import std.range;

    /* True when left-over ('rest') fields are part of the output. */
    enum bool hasRest = (rest != RestLocation.none);

    // Check that the template instantiation agrees with the run-time option.
    static if (rest == RestLocation.none)
        enum expectedRestArg = TsvSelectOptions.RestOption.none;
    else static if (rest == RestLocation.first)
        enum expectedRestArg = TsvSelectOptions.RestOption.first;
    else static if (rest == RestLocation.last)
        enum expectedRestArg = TsvSelectOptions.RestOption.last;
    else
        static assert(false, "rest template argument does not match cmdopt.restArg.");

    assert(cmdopt.restArg == expectedRestArg);

    /* Sanity checks on the input setup: there is at least one input source (stdin if
     * nothing else), and the byLine range strips line terminators.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* The algorithm relies on '--exclude' never being combined with RestLocation.none. */
    assert(hasRest || cmdopt.excludedFieldsTable.length == 0);

    /* InputFieldReordering copies the selected fields of an input line into a buffer
     * arranged in output order.
     */
    auto reorderer = new InputFieldReordering!char(cmdopt.fields);

    /* Fields that are not on the --fields list are collected separately so they can be
     * written as a group (the --rest option). An Appender is used as it is faster than
     * the ~= operator. It is cleared and reused for every line.
     */
    static if (hasRest) auto restFields = appender!(char[][]);

    /* BufferedOutputRange (from common/utils.d) is a performance improvement over
     * writing directly to stdout.
     */
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    // Process every line of every input source (files or stdin).
    foreach (fileNum, inputStream; cmdopt.inputSources.enumerate)
    {
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            immutable bool isFirstLine = (lineNum == 1);

            if (isFirstLine) throwIfWindowsNewline(line, inputStream.name, lineNum);

            // Only the header line of the first file is kept.
            if (isFirstLine && fileNum > 0 && cmdopt.hasHeader) continue;

            static if (hasRest)
            {
                restFields.clear;

                /* Offset in the line at which the next field begins. It allows the
                 * remainder of the line to be appended in one step once the last
                 * specified field has been handled.
                 */
                size_t nextFieldStart = 0;
            }

            reorderer.initNewLine;

            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                static if (!hasRest)
                {
                    reorderer.processNextField(fieldIndex, fieldValue);
                    if (reorderer.allFieldsFilled) break;
                }
                else
                {
                    /* Processing with 'rest' fields. States:
                     *  - Excluded fields and specified fields remain
                     *  - Only specified fields remain
                     *  - Only excluded fields remain
                     */
                    nextFieldStart += fieldValue.length + 1;

                    immutable size_t excludedTableLength = cmdopt.excludedFieldsTable.length;
                    immutable bool withinExcludedTable = fieldIndex < excludedTableLength;
                    immutable bool isExcluded =
                        withinExcludedTable && cmdopt.excludedFieldsTable[fieldIndex];

                    /* Excluded fields may still lie ahead while the index is inside the
                     * table, except when the current field is the table's final entry.
                     */
                    bool excludedFieldsRemain = withinExcludedTable;

                    if (isExcluded)
                    {
                        if (fieldIndex + 1 == excludedTableLength) excludedFieldsRemain = false;
                    }
                    else if (reorderer.processNextField(fieldIndex, fieldValue) == 0)
                    {
                        // Not a selected field, so it belongs with the 'rest' group.
                        restFields.put(fieldValue);
                    }

                    if (reorderer.allFieldsFilled && !excludedFieldsRemain)
                    {
                        /* All specified and excluded fields have been handled. Any text
                         * after the current field is appended in bulk. Cases:
                         * - Current field is the last field: nextFieldStart is one
                         *   past the line length and nothing is appended.
                         * - Otherwise the remaining text is appended as one entry. It
                         *   is empty when the line ends with a delimiter.
                         */
                        if (nextFieldStart <= line.length)
                        {
                            restFields.put(line[nextFieldStart .. $]);
                        }

                        break;
                    }
                }
            }

            // Every specified field must have been found on the line.
            enforce(reorderer.allFieldsFilled,
                    format("Not enough fields in line. File: %s,  Line: %s",
                           inputStream.name, lineNum));

            /* Write the re-ordered line. A delimiter separates the 'rest' group from
             * the selected fields only when both are present.
             */
            static if (hasRest)
            {
                immutable bool haveRestFields = restFields.data.length > 0;
                immutable bool haveSelectedFields = cmdopt.fields.length > 0;
            }

            static if (rest == RestLocation.first)
            {
                if (haveRestFields)
                {
                    bufferedOutput.joinAppend(restFields.data, cmdopt.delim);
                    if (haveSelectedFields) bufferedOutput.append(cmdopt.delim);
                }
            }

            bufferedOutput.joinAppend(reorderer.outputFields, cmdopt.delim);

            static if (rest == RestLocation.last)
            {
                if (haveRestFields)
                {
                    if (haveSelectedFields) bufferedOutput.append(cmdopt.delim);
                    bufferedOutput.joinAppend(restFields.data, cmdopt.delim);
                }
            }

            bufferedOutput.appendln;

            /* The first line of the first file is flushed right away. This helps detect
             * errors quickly in multi-stage unix pipelines. Note that tsv-select may
             * have been sent one line from an upstream process, usually a header line.
             */
            if (isFirstLine && fileNum == 0) bufferedOutput.flush;
        }
    }
}