1 /**
2 A variant of the unix 'cut' program, with the ability to reorder fields.
3 
4 tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
5 fields. Lines are read from files or standard input and split on a delimiter character.
6 Fields are written to standard output in the order listed. Fields can be listed more
7 than once, and fields not listed can be written out as a group.
8 
9 This program is intended both as a useful utility and a D programming language example.
10 Functionality and constructs used include command line argument processing, file I/O,
11 exception handling, ranges, tuples and strings, templates, universal function call syntax
12 (UFCS), lambdas and functional programming constructs. Comments are more verbose than
13 typical to shed light on D programming constructs, but not to the level of a tutorial.
14 
15 Copyright (c) 2015-2020, eBay Inc.
16 Initially written by Jon Degenhardt
17 
License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
19 */
20 
21 module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.
22 
// Imports used by multiple routines. Other imports are made in local context.
24 import std.exception : enforce;
25 import std.stdio;
26 import std.typecons : tuple, Tuple;
27 
// Short help text, written as a D delimited ('heredoc') string. processArgs passes it to
// defaultGetoptPrinter when help is requested, which prints it followed by the getopt
// formatted option list. That is why the text ends with an "Options:" heading.
immutable helpText = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields numbers start with one. They are comma separated and ranges can be
used. Fields can be repeated, and fields not included in the '--f|fields'
option can be selected as a group using '--r|rest'. Fields can be dropped
using '--e|exclude'. Multiple files with header lines can be managed with
'--H|header', which retains the header of the first file only.

Examples:

   # Output fields 2 and 1, in that order
   tsv-select -f 2,1 data.tsv

   # Drop the first field, keep everything else.
   tsv-select --exclude 1 file.tsv

   # Move the first field to the end
   tsv-select -f 1 --rest first data.tsv

   # Multiple files with header lines. Keep only one header.
   tsv-select data*.tsv -H --fields 1,2,4-7,14

Use '--help-verbose' for detailed information.

Options:
EOS";
60 
// Detailed help text, printed for '--help-verbose'. Like helpText, it is passed to
// defaultGetoptPrinter and is followed by the getopt formatted option list, hence the
// trailing "Options:" heading. Examples must use '--rest' (double dash): std.getopt does
// not accept long option names behind a single dash.
immutable helpTextVerbose = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields numbers start with one. They are comma separated and ranges can be
used. Fields can be repeated, and fields not included in the '--f|fields'
option can be selected as a group using '--r|rest'. Use '--H|header' to
retain the header line from only the first file.

Fields can be excluded using '--e|exclude'. All fields not excluded are
output. '--f|fields' and '--r|rest' can be used with '--e|exclude' to
reorder non-excluded fields.

Examples:

   # Keep the first field from two files
   tsv-select -f 1 file1.tsv file2.tsv

   # Keep fields 1 and 2, retain the header from the first file
   tsv-select -H -f 1,2 file1.tsv file2.tsv

   # Field reordering and field ranges
   tsv-select -f 3,2,1 file.tsv
   tsv-select -f 1,4-7,11 file.tsv
   tsv-select -f 1,7-4,11 file.tsv

   # Repeating fields
   tsv-select -f 1,2,1 file.tsv
   tsv-select -f 1-3,3-1 file.tsv

   # Move field 5 to the front
   tsv-select -f 5 --rest last file.tsv

   # Move fields 4 and 5 to the end
   tsv-select -f 4,5 --rest first file.tsv

   # Drop the first field, keep everything else
   tsv-select --exclude 1 file.tsv

   # Move field 2 to the front and drop fields 10-15
   tsv-select -f 2 -e 10-15 file.tsv

   # Move field 2 to the end, dropping fields 10-15
   tsv-select -f 2 --rest first -e 10-15 file.tsv

Notes:
* One of '--f|fields' or '--e|exclude' is required.
* Fields specified by '--f|fields' and '--e|exclude' cannot overlap.
* When '--f|fields' and '--e|exclude' are used together, the effect is to
  specify '--rest last'. This can be overridden by using '--rest first'.
* Each input line must be long enough to contain all fields specified with
  '--f|fields'. This is not necessary for '--e|exclude' fields.

Options:
EOS";
119 
/** Holds the tsv-select command line options, plus the values derived from them.
 */
struct TsvSelectOptions
{
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    // Values accepted by the --rest option ('none' is the unadvertised default).
    enum RestOption { none, first, last};

    string programName;                 /// Program name
    ByLineSourceRange!() inputSources;  /// Input Files
    bool helpVerbose = false;           /// --help-verbose
    bool hasHeader = false;             /// --H|header
    char delim = '\t';                  /// --d|delimiter
    size_t[] fields;                    /// --f|fields
    size_t[] excludedFieldsArg;         /// --e|exclude
    RestOption restArg;                 /// --rest first|last (none is hidden default)
    bool versionWanted = false;         /// --V|version
    bool[] excludedFieldsTable;         /// Derived. Lookup table for excluded fields.

    /** Parse and validate the command line (a cover for getopt).
     *
     * Runs getopt over cmdArgs, then performs the consistency checks and computes the
     * derived members (inputSources, excludedFieldsTable, the effective restArg).
     *
     * Returns: A tuple (bool, int). The bool is true when the arguments were processed
     * successfully and the program should go on to do its work. It is false when help or
     * version output was printed, or when an error occurred; in that case the int is the
     * exit code to use (0 for help/version, 1 for errors).
     *
     * On a true return, field lists hold zero-based indices and cmdArgs has been
     * truncated to the program name only (file arguments are consumed).
     * Errors are reported on stderr rather than thrown to the caller.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : any, each, maxElement;
        import std.format : format;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils :  makeFieldListOptionHandler;

        if (cmdArgs.length > 0)
        {
            programName = cmdArgs[0].stripExtension.baseName;
        }
        else
        {
            programName = "Unknown_program_name";
        }

        try
        {
            // Comma separates the values of list-valued options.
            arraySep = ",";

            // Note: the caseSensitive/caseInsensitive toggles apply to the options that
            // follow them, so the order of the entries below matters.
            auto parsed = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header",    "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "f|fields",    "<field-list>  Fields to retain. Fields are output in the order listed.",
                fields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "e|exclude",   "<field-list>  Fields to exclude.",
                excludedFieldsArg.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "r|rest",      "first|last    Output location for fields not included in '--f|fields'.", &restArg,
                "d|delimiter", "CHR           Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version",   "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            // Informational requests: print and stop, in priority order.
            if (parsed.helpWanted)
            {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }

            if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, parsed.options);
                return tuple(false, 0);
            }

            if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /*
             * Validation and derived values.
             */

            immutable bool haveSelected = fields.length != 0;
            immutable bool haveExcluded = excludedFieldsArg.length != 0;

            enforce(haveSelected || haveExcluded,
                    "One of '--f|fields' or '--e|exclude' is required.");

            /* Whatever getopt left behind (past the program name) is the file list.
             * Fall back to standard input ("-") when no files were given, then shrink
             * cmdArgs so the file arguments count as consumed.
             */
            string[] filepaths = ["-"];
            if (cmdArgs.length > 1) filepaths = cmdArgs[1 .. $];
            cmdArgs.length = 1;
            inputSources = byLineSourceRange(filepaths);

            if (haveExcluded)
            {
                /* A field may not be both selected and excluded. */
                foreach (selected; fields)
                {
                    enforce(!excludedFieldsArg.any!(excluded => excluded == selected),
                            "'--f|fields' and '--e|exclude' have overlapping fields.");
                }

                /* With '--exclude', an unspecified '--rest' means 'last'. */
                if (restArg == RestOption.none)
                {
                    restArg = RestOption.last;
                }

                /* The lookup table is sized by the largest excluded field index, which
                 * users have no reason to expect. Cap the index at a large but sane
                 * value (a bit over one million) so an arbitrary huge number does not
                 * trigger a huge allocation. Raise the cap if real use cases need it.
                 */
                immutable size_t highestExcluded = excludedFieldsArg.maxElement;
                immutable size_t excludedFieldCap = 1024 * 1024;

                enforce(highestExcluded < excludedFieldCap,
                        format("Maximum allowed '--e|exclude' field number is %d.",
                               excludedFieldCap));

                // Newly added entries are default initialized to false.
                excludedFieldsTable.length = highestExcluded + 1;
                foreach (excluded; excludedFieldsArg)
                {
                    excludedFieldsTable[excluded] = true;
                }
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }
}
258 
/* Runtime options compiled into the executable via the 'rt_options' symbol.
 * "gcopt=cleanup:none" configures the garbage collector's 'cleanup' setting to 'none',
 * i.e. no GC cleanup work at program termination.
 * NOTE(review): the version guard presumably reflects the compiler release in which
 * the 'cleanup' GC option became available - confirm against the druntime changelog.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
260 
/** Program entry point.
 *
 * Processes the command line, then runs the tsvSelect instantiation that matches the
 * --rest setting. Returns the process exit code: 0 on success (including help and
 * version requests), 1 when argument processing or tsvSelect reports an error.
 */
int main(string[] cmdArgs)
{
    /* Merge coverage reports across runs when built by DMD in code coverage mode. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const argsResult = cmdopt.processArgs(cmdArgs);
    immutable bool keepGoing = argsResult[0];
    if (!keepGoing)
    {
        return argsResult[1];
    }

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    alias RestOption = TsvSelectOptions.RestOption;
    int exitStatus = 0;

    try
    {
        /* getopt has already stripped the option arguments and processArgs has turned
         * the remaining file arguments into cmdopt.inputSources. All that is left is
         * to dispatch to the tsvSelect instantiation for the chosen --rest value.
         */
        final switch (cmdopt.restArg)
        {
        case RestOption.last:
            tsvSelect!(RestLocation.last)(cmdopt);
            break;
        case RestOption.first:
            tsvSelect!(RestLocation.first)(cmdopt);
            break;
        case RestOption.none:
            tsvSelect!(RestLocation.none)(cmdopt);
            break;
        }
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        exitStatus = 1;
    }

    return exitStatus;
}
307 
308 // tsvSelect
309 
/** Selects which specialization of the tsvSelect template is instantiated.
 *
 * RestLocation mirrors the TsvSelectOptions.RestOption enum value for value. main maps
 * the parsed RestOption to the matching RestLocation when choosing the tsvSelect
 * instantiation to call. The two enums are kept distinct so that this one is not tied
 * to the end-user UI: the member names of TsvSelectOptions.RestOption are the literal
 * values accepted on the command line.
 */
enum RestLocation { none, first, last };
318 
/** tsvSelect does the primary work of the tsv-select program.
 *
 * Input is read line by line, extracting the listed fields and writing them out in the order
 * specified. An exception is thrown on error, in particular when a line does not contain all
 * of the fields listed with --fields.
 *
 * This function is templatized with instantiations for the different --rest options. This
 * avoids repeatedly running the same if-tests inside the inner loop. The main function
 * instantiates this function three times, once for each of the --rest options. It results
 * in a larger program, but is faster. Run-time improvements of 25% were measured compared
 * to the non-templatized version.
 *
 * Params:
 *   rest   = Compile-time --rest setting. Must agree with cmdopt.restArg (asserted below).
 *   cmdopt = Processed command line options. inputSources is consumed by this function.
 */

void tsvSelect(RestLocation rest)(ref TsvSelectOptions cmdopt)
{
    import tsv_utils.common.utils: BufferedOutputRange, ByLineSourceRange,
        InputFieldReordering, throwIfWindowsNewlineOnUnix;
    import std.algorithm: splitter;
    import std.array : appender, Appender;
    import std.format: format;
    import std.range;

    // Ensure the correct template instantiation was called.
    static if (rest == RestLocation.none)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.none);
    else static if (rest == RestLocation.first)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.first);
    else static if (rest == RestLocation.last)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.last);
    else
        static assert(false, "rest template argument does not match cmdopt.restArg.");

    /* Check that the input files were setup as expected. Should at least have one
     * input, stdin if nothing else, and newlines removed from the byLine range.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* The algorithm here assumes RestLocation.none is not used with '--e|exclude'.
     * (processArgs switches the --rest default to 'last' when fields are excluded.)
     */
    assert(cmdopt.excludedFieldsTable.length == 0 || rest != RestLocation.none);

    /* InputFieldReordering copies select fields from an input line to a new buffer.
     * The buffer is reordered in the process.
     */
    auto fieldReordering = new InputFieldReordering!char(cmdopt.fields);

    /* Fields not on the --fields list are added to a separate buffer so they can be
     * output as a group (the --rest option). This is done using an 'Appender', which
     * is faster than the ~= operator. The Appender is created once and cleared at the
     * start of each line, so it is reused across lines rather than rebuilt.
     * It is only declared for instantiations that actually output 'rest' fields.
     */
    static if (rest != RestLocation.none)
    {
        auto leftOverFieldsAppender = appender!(char[][]);
    }

    /* BufferedOutputRange (from tsv_utils.common.utils) is a performance improvement
     * over writing directly to stdout.
     */
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    /* Read each input file (or stdin) and iterate over each line.
     * fileNum is zero-based; lineNum is one-based within each file.
     */
    foreach (fileNum, inputStream; cmdopt.inputSources.enumerate)
    {
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            // Only the first line of each file is checked.
            // NOTE(review): presumably throws when a CRLF line ending is seen on a Unix build - confirm in tsv_utils.common.utils.
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum);

            if (lineNum == 1 && fileNum > 0 && cmdopt.hasHeader)
            {
                continue;   // Drop the header line from all but the first file.
            }

            static if (rest != RestLocation.none)
            {
                leftOverFieldsAppender.clear;

                /* Track the field location in the line. This enables bulk appending
                 * after the last specified field has been processed.
                 */
                size_t nextFieldStart = 0;
            }

            fieldReordering.initNewLine;

            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                static if (rest == RestLocation.none)
                {
                    // No 'rest' output: stop scanning the line once every listed field is found.
                    fieldReordering.processNextField(fieldIndex, fieldValue);
                    if (fieldReordering.allFieldsFilled) break;
                }
                else
                {
                    /* Processing with 'rest' fields. States:
                     *  - Excluded fields and specified fields remain
                     *  - Only specified fields remain
                     *  - Only excluded fields remain
                     */

                    // Advance past this field and the single-char delimiter following it.
                    nextFieldStart += fieldValue.length + 1;

                    // The table length is the highest excluded index + 1, so indices at
                    // or past its end can never be excluded.
                    bool excludedFieldsRemain = fieldIndex < cmdopt.excludedFieldsTable.length;
                    immutable isExcluded = excludedFieldsRemain && cmdopt.excludedFieldsTable[fieldIndex];

                    if (!isExcluded)
                    {
                        immutable numMatched = fieldReordering.processNextField(fieldIndex, fieldValue);

                        // A field matching no --fields entry belongs to the 'rest' group.
                        if (numMatched == 0) leftOverFieldsAppender.put(fieldValue);
                    }
                    else if (fieldIndex + 1 == cmdopt.excludedFieldsTable.length)
                    {
                        // This was the last (highest) excluded field.
                        excludedFieldsRemain = false;
                    }

                    if (fieldReordering.allFieldsFilled && !excludedFieldsRemain)
                    {
                        /* Processed all specified and excluded fields. Bulk append the
                         * remainder of the line as a single entry. It still contains its
                         * delimiters, which are the same delimiter used when the entries
                         * are joined for output. Cases:
                         * - Current field is the last field, no trailing delimiter:
                         *   nextFieldStart is line.length + 1, nothing is appended.
                         * - Line ends with a delimiter right after the current field:
                         *   nextFieldStart equals line.length, an empty entry is appended
                         *   for the trailing empty field.
                         * - Otherwise: the text following the current field's delimiter
                         *   is appended.
                         */
                        if (nextFieldStart <= line.length)
                        {
                            leftOverFieldsAppender.put(line[nextFieldStart .. $]);
                        }

                        break;
                    }
                }
            }

            // Finished with all fields in the line.
            enforce(fieldReordering.allFieldsFilled,
                    format("Not enough fields in line. File: %s,  Line: %s",
                           inputStream.name, lineNum));

            // Write the re-ordered line. A delimiter is placed between the 'rest' group
            // and the selected fields only when both groups are non-empty.

            static if (rest == RestLocation.first)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                }
            }

            bufferedOutput.joinAppend(fieldReordering.outputFields, cmdopt.delim);

            static if (rest == RestLocation.last)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                }
            }

            bufferedOutput.appendln;
        }
    }
}