1 /**
2 A variant of the unix 'cut' program, with the ability to reorder fields.
3 
4 tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
5 fields. Lines are read from files or standard input and split on a delimiter character.
6 Fields are written to standard output in the order listed. Fields can be listed more
7 than once, and fields not listed can be written out as a group.
8 
9 This program is intended both as a useful utility and a D programming language example.
10 Functionality and constructs used include command line argument processing, file I/O,
11 exception handling, ranges, tuples and strings, templates, universal function call syntax
12 (UFCS), lambdas and functional programming constructs. Comments are more verbose than
13 typical to shed light on D programming constructs, but not to the level of a tutorial.
14 
15 Copyright (c) 2015-2020, eBay Inc.
16 Initially written by Jon Degenhardt
17 
18 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
19 */
20 
21 module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.
22 
// Imports used by multiple routines. Other imports are made in local context.
24 import std.stdio;
25 import std.typecons : tuple, Tuple;
26 
/* Brief help text. Written as a D delimited ('heredoc' style) string, so everything
 * between the EOS markers is used verbatim. processArgs hands this text to
 * defaultGetoptPrinter when getopt reports that help was requested; the printer
 * follows it with the getopt formatted option list, which is why the text ends with
 * an "Options:" heading.
 */
immutable helpText = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields numbers start with one. They are comma separated and ranges can be
used. Fields can be repeated, and fields not included in the '--f|fields'
option can be selected as a group using '--r|rest'. Fields can be dropped
using '--e|exclude'. Multiple files with header lines can be managed with
'--H|header', which retains the header of the first file only.

Examples:

   # Output fields 2 and 1, in that order
   tsv-select -f 2,1 data.tsv

   # Drop the first field, keep everything else.
   tsv-select --exclude 1 file.tsv

   # Move the first field to the end
   tsv-select -f 1 --rest first data.tsv

   # Multiple files with header lines. Keep only one header.
   tsv-select data*.tsv -H --fields 1,2,4-7,14

Use '--help-verbose' for detailed information.

Options:
EOS";
59 
/* Detailed help text, printed by processArgs when '--help-verbose' is given. Like the
 * brief help text it is a verbatim delimited string and is followed on output by the
 * getopt formatted option list, hence the trailing "Options:" heading.
 *
 * Every example must be a command line the program accepts as written: the long form
 * of the rest option is spelled '--rest' (two dashes), the short form is '-r'.
 */
immutable helpTextVerbose = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields numbers start with one. They are comma separated and ranges can be
used. Fields can be repeated, and fields not included in the '--f|fields'
option can be selected as a group using '--r|rest'. Use '--H|header' to
retain the header line from only the first file.

Fields can be excluded using '--e|exclude'. All fields not excluded are
output. '--f|fields' and '--r|rest' can be used with '--e|exclude' to
reorder non-excluded fields.

Examples:

   # Keep the first field from two files
   tsv-select -f 1 file1.tsv file2.tsv

   # Keep fields 1 and 2, retain the header from the first file
   tsv-select -H -f 1,2 file1.tsv file2.tsv

   # Field reordering and field ranges
   tsv-select -f 3,2,1 file.tsv
   tsv-select -f 1,4-7,11 file.tsv
   tsv-select -f 1,7-4,11 file.tsv

   # Repeating fields
   tsv-select -f 1,2,1 file.tsv
   tsv-select -f 1-3,3-1 file.tsv

   # Move field 5 to the front
   tsv-select -f 5 --rest last file.tsv

   # Move fields 4 and 5 to the end
   tsv-select -f 4,5 --rest first file.tsv

   # Drop the first field, keep everything else
   tsv-select --exclude 1 file.tsv

   # Move field 2 to the front and drop fields 10-15
   tsv-select -f 2 -e 10-15 file.tsv

   # Move field 2 to the end, dropping fields 10-15
   tsv-select -f 2 --rest first -e 10-15 file.tsv

Notes:
* One of '--f|fields' or '--e|exclude' is required.
* Fields specified by '--f|fields' and '--e|exclude' cannot overlap.
* When '--f|fields' and '--e|exclude' are used together, the effect is to
  specify '--rest last'. This can be overridden by using '--rest first'.
* Each input line must be long enough to contain all fields specified with
  '--f|fields'. This is not necessary for '--e|exclude' fields.

Options:
EOS";
118 
/** Holds the command line settings for tsv-select and knows how to parse and
 * validate them.
 */
struct TsvSelectOptions
{
    /// Values accepted by --rest. 'none' is the default and is not described in the option help.
    enum RestOption { none, first, last }

    string programName;           /// Program name
    bool helpVerbose = false;     /// --help-verbose
    bool hasHeader = false;       /// --H|header
    char delim = '\t';            /// --d|delimiter
    size_t[] fields;              /// --f|fields
    size_t[] excludedFieldsArg;   /// --e|exclude
    RestOption restArg;           /// --rest first|last (none is hidden default)
    bool versionWanted = false;   /// --V|version
    bool[] excludedFieldsTable;   /// Derived. Lookup table for excluded fields.

    /** Parse and validate the command line (a cover for getopt).
     *
     * The options are parsed with getopt, which removes them from cmdArgs. The
     * result is a (bool, int) tuple:
     *  - (true, 0): arguments are valid, derived values have been computed, and
     *    processing should continue. The field lists are requested from the option
     *    handler in zero-based form.
     *  - (false, 0): help or version output was printed; exit successfully.
     *  - (false, 1): an error was reported on stderr; exit with failure.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : any, maxElement;
        import std.format : format;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes;
        import tsv_utils.common.utils :  makeFieldListOptionHandler;

        if (cmdArgs.length > 0)
        {
            programName = cmdArgs[0].stripExtension.baseName;
        }
        else
        {
            programName = "Unknown_program_name";
        }

        try
        {
            // Values inside a single command line option are comma separated.
            arraySep = ",";

            auto parsed = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header",    "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "f|fields",    "<field-list>  Fields to retain. Fields are output in the order listed.",
                fields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "e|exclude",   "<field-list>  Fields to exclude.",
                excludedFieldsArg.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "r|rest",      "first|last    Output location for fields not included in '--f|fields'.", &restArg,
                "d|delimiter", "CHR           Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version",   "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            // Informational requests: print and stop with a success exit code.
            if (parsed.helpWanted)
            {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }

            if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, parsed.options);
                return tuple(false, 0);
            }

            if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /*
             * Validation and derived values.
             */

            immutable bool haveSelected = fields.length > 0;
            immutable bool haveExcluded = excludedFieldsArg.length > 0;

            if (!haveSelected && !haveExcluded)
            {
                throw new Exception("One of '--f|fields' or '--e|exclude' is required.");
            }

            if (haveExcluded)
            {
                /* A field may not be both selected and excluded. */
                foreach (excluded; excludedFieldsArg)
                {
                    if (fields.any!(selected => selected == excluded))
                    {
                        throw new Exception("'--f|fields' and '--e|exclude' have overlapping fields.");
                    }
                }

                /* With '--exclude', an unspecified '--rest' means 'last'. */
                if (restArg == RestOption.none)
                {
                    restArg = RestOption.last;
                }

                /* The lookup table is sized by the largest excluded field index, which
                 * users have no reason to expect. Since arbitrarily large field numbers
                 * might be given when trimming fields, cap the index at a large but
                 * reasonable value. The cap can be raised if use cases arise.
                 */
                enum size_t maxAllowedExcludedField = 1024 * 1024;
                immutable size_t highestExcluded = excludedFieldsArg.maxElement;

                if (highestExcluded >= maxAllowedExcludedField)
                {
                    throw new Exception(format("Maximum allowed '--e|exclude' field number is %d.",
                                               maxAllowedExcludedField));
                }

                /* New entries start out false; flag each excluded index. */
                excludedFieldsTable.length = highestExcluded + 1;
                foreach (excluded; excludedFieldsArg)
                {
                    excludedFieldsTable[excluded] = true;
                }
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }
}
254 
/* Runtime options embedded in the executable via the D runtime's 'rt_options' hook.
 * "gcopt=cleanup:none" is a garbage collector setting; it is only declared when the
 * compiler front-end is version 2.085 or later.
 */
static if (__VERSION__ >= 2085)
{
    extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
}
256 
/** Program entry point.
 *
 * Processes the command line, then runs the tsvSelect instantiation that matches the
 * --rest setting on the remaining arguments (the input files).
 *
 * Returns: The exit code from argument processing when it indicates execution should
 * not continue; otherwise 0 on success, or 1 if tsvSelect throws an Exception (the
 * message is written to stderr).
 */
int main(string[] cmdArgs)
{
    /* DMD code coverage builds: merge reports rather than overwriting them. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const argStatus = cmdopt.processArgs(cmdArgs);
    immutable bool shouldContinue = argStatus[0];
    if (!shouldContinue) return argStatus[1];

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    int exitCode = 0;

    try
    {
        /* getopt has removed the option arguments, leaving the program name followed
         * by any file names. Everything after the program name is an input file.
         */
        const inputFiles = cmdArgs[1 .. $];

        alias RestOption = TsvSelectOptions.RestOption;

        /* Pick the tsvSelect instantiation for the chosen --rest option. */
        final switch (cmdopt.restArg)
        {
        case RestOption.none:
            tsvSelect!(RestLocation.none)(cmdopt, inputFiles);
            break;
        case RestOption.first:
            tsvSelect!(RestLocation.first)(cmdopt, inputFiles);
            break;
        case RestOption.last:
            tsvSelect!(RestLocation.last)(cmdopt, inputFiles);
            break;
        }
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        exitCode = 1;
    }

    return exitCode;
}
303 
304 // tsvSelect
305 
/** Selects which specialization of the tsvSelect template is instantiated.
 *
 * The members mirror TsvSelectOptions.RestOption one for one, and main maps each
 * RestOption value to the tsvSelect instantiation with the same-named RestLocation.
 * The two enums are kept apart on purpose: RestOption defines the text users may type
 * for the --rest command line option, while RestLocation is internal to the
 * implementation and independent of the end-user UI.
 */
enum RestLocation
{
    none,
    first,
    last,
}
314 
/** tsvSelect does the primary work of the tsv-select program.
 *
 * Input is read line by line, extracting the listed fields and writing them out in the order
 * specified. An exception is thrown on error.
 *
 * This function is templatized with instantiations for the different --rest options. This
 * avoids repeatedly running the same if-tests inside the inner loop. The main function
 * instantiates this function three times, once for each of the --rest options. It results
 * in a larger program, but is faster. Run-time improvements of 25% were measured compared
 * to the non-templatized version.
 *
 * Params:
 *   rest       = Compile-time selector for where fields not listed in --fields are written
 *                (none, first, or last). Must agree with cmdopt.restArg.
 *   cmdopt     = Processed command line options.
 *   inputFiles = Files to read. An empty list, or a file name of "-", reads standard input.
 *
 * Throws: Exception if a line does not contain all the fields listed in --fields.
 */
void tsvSelect(RestLocation rest)(const TsvSelectOptions cmdopt, const string[] inputFiles)
{
    import tsv_utils.common.utils: BufferedOutputRange, bufferedByLine, InputFieldReordering, throwIfWindowsNewlineOnUnix;
    import std.algorithm: splitter;
    import std.format: format;
    import std.range;

    // Ensure the correct template instantiation was called.
    static if (rest == RestLocation.none)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.none);
    else static if (rest == RestLocation.first)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.first);
    else static if (rest == RestLocation.last)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.last);
    else
        static assert(false, "rest template argument does not match cmdopt.restArg.");

    /* The algorithm here assumes RestOption.none is not used with --exclude. (Argument
     * processing switches --rest to 'last' when --exclude is given without --rest.)
     */
    assert(cmdopt.excludedFieldsTable.length == 0 || rest != RestLocation.none);

    /* InputFieldReordering copies select fields from an input line to a new buffer.
     * The buffer is reordered in the process.
     */
    auto fieldReordering = new InputFieldReordering!char(cmdopt.fields);

    /* Fields not on the --fields list are added to a separate buffer so they can be
     * output as a group (the --rest option). This is done using an 'Appender', which
     * is faster than the ~= operator. The Appender is cleared and reused for each
     * line. It is only needed, and only declared, when a --rest location is in effect.
     */
    static if (rest != RestLocation.none)
    {
        auto leftOverFieldsAppender = appender!(char[][]);
    }

    /* BufferedOutputRange (from tsv_utils.common.utils) is a performance improvement
     * over writing directly to stdout.
     */
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    /* Read each input file (or stdin) and iterate over each line. A filename of "-" is
     * interpreted as stdin, common behavior for unix command line tools.
     */
    foreach (fileNum, filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();

        // Line numbers are one-based (enumerate(1)); they are used in error messages.
        foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1))
        {
            // The Windows newline check is made on the first line of each file only.
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            if (lineNum == 1 && fileNum > 0 && cmdopt.hasHeader)
            {
                continue;   // Drop the header line from all but the first file.
            }

            static if (rest != RestLocation.none)
            {
                leftOverFieldsAppender.clear;

                /* Track the field location in the line. This enables bulk appending
                 * after the last specified field has been processed.
                 */
                size_t nextFieldStart = 0;
            }

            fieldReordering.initNewLine;

            // fieldIndex is zero-based, matching the zero-based indices in cmdopt.
            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                static if (rest == RestLocation.none)
                {
                    // No 'rest' output: stop reading the line once every listed field is found.
                    fieldReordering.processNextField(fieldIndex, fieldValue);
                    if (fieldReordering.allFieldsFilled) break;
                }
                else
                {
                    /* Processing with 'rest' fields. States:
                     *  - Excluded fields and specified fields remain
                     *  - Only specified fields remain
                     *  - Only excluded fields remain
                     */

                    // Advance past this field and the delimiter following it (the +1).
                    nextFieldStart += fieldValue.length + 1;

                    /* The excluded fields table extends only to the largest excluded
                     * index. Indices at or past its length cannot be excluded.
                     */
                    bool excludedFieldsRemain = fieldIndex < cmdopt.excludedFieldsTable.length;
                    immutable isExcluded = excludedFieldsRemain && cmdopt.excludedFieldsTable[fieldIndex];

                    if (!isExcluded)
                    {
                        immutable numMatched = fieldReordering.processNextField(fieldIndex, fieldValue);

                        // A field matching nothing on the --fields list is a 'rest' field.
                        if (numMatched == 0) leftOverFieldsAppender.put(fieldValue);
                    }
                    else if (fieldIndex + 1 == cmdopt.excludedFieldsTable.length)
                    {
                        // This was the last entry in the table: no excluded fields remain.
                        excludedFieldsRemain = false;
                    }

                    if (fieldReordering.allFieldsFilled && !excludedFieldsRemain)
                    {
                        /* Processed all specified fields and passed all excluded fields.
                         * Bulk append any fields remaining on the line. Cases:
                         * - Current field is the last field: nextFieldStart is one past
                         *   the end of the line (line.length + 1), nothing is appended.
                         * - Otherwise: the remainder of the line, embedded delimiters
                         *   included, is appended as a single entry. When the line ends
                         *   with a delimiter the remainder is a single empty field.
                         */
                        if (nextFieldStart <= line.length)
                        {
                            leftOverFieldsAppender.put(line[nextFieldStart .. $]);
                        }

                        break;
                    }
                }
            }

            // Finished with all fields in the line.
            if (!fieldReordering.allFieldsFilled)
            {
                throw new Exception(
                    format("Not enough fields in line. File: %s,  Line: %s",
                           (filename == "-") ? "Standard Input" : filename, lineNum));
            }

            // Write the re-ordered line.

            /* --rest first: the left over fields precede the listed fields. A delimiter
             * separates the two groups only when there are listed fields to follow.
             */
            static if (rest == RestLocation.first)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                }
            }

            bufferedOutput.joinAppend(fieldReordering.outputFields, cmdopt.delim);

            /* --rest last: the left over fields follow the listed fields. A delimiter
             * separates the two groups only when listed fields were written.
             */
            static if (rest == RestLocation.last)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                }
            }

            bufferedOutput.appendln;
        }
    }
}