1 /**
2 A variant of the unix 'cut' program, with the ability to reorder fields.
3 
4 tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
5 fields. Lines are read from files or standard input and split on a delimiter character.
6 Fields are written to standard output in the order listed. Fields can be listed more
7 than once, and fields not listed can be written out as a group.
8 
9 This program is intended both as a useful utility and a D programming language example.
10 Functionality and constructs used include command line argument processing, file I/O,
11 exception handling, ranges, tuples and strings, templates, universal function call syntax
12 (UFCS), lambdas and functional programming constructs. Comments are more verbose than
13 typical to shed light on D programming constructs, but not to the level of a tutorial.
14 
15 Copyright (c) 2015-2018, eBay Software Foundation
16 Initially written by Jon Degenhardt
17 
18 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
19 */
20 
21 module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.
22 
// Imports used by multiple routines. Other imports are made in local context.
24 import std.stdio;
25 import std.typecons : tuple, Tuple;
26 
/* 'Heredoc' style help text, written as a D delimited string (q"EOS ... EOS"). It is
 * printed ahead of the getopt-formatted option list when help is requested.
 *
 * Declared immutable: the text is a constant and is never modified, so there is no
 * reason for it to be a mutable, thread-local module variable. An immutable string
 * still converts implicitly to the 'string' parameter of defaultGetoptPrinter.
 */
immutable helpText = q"EOS
Synopsis: tsv-select -f <field-list> [options] [file...]

tsv-select reads files or standard input and writes specified fields to standard
output in the order listed. Similar to 'cut' with the ability to reorder fields.

Fields numbers start with one. They are comma separated, and ranges can be used.
Fields can be listed more than once, and fields not listed can be output using
the --rest option. Multiple files with header lines can be managed with the
--header option, which retains the header of the first file and drops the rest.

Examples:

   tsv-select -f 4,2,9 file1.tsv file2.tsv
   tsv-select -f 1,4-7,11 file1.tsv
   tsv-select -f 1,7-4,11 file1.tsv
   tsv-select --delimiter ' ' -f 2,4,6 --rest last file1.txt
   cat file*.tsv | tsv-select -f 3,2,1

Options:
EOS";
49 
/** Command line options for tsv-select, plus the logic that parses them.
 */
struct TsvSelectOptions
{
    /// Values accepted on the command line by the --rest option.
    enum RestOptionVal { none, first, last }

    string programName;         // Derived from cmdArgs[0]; used in error messages.
    bool hasHeader = false;     // -H, --header
    char delim = '\t';          // -d, --delimiter
    size_t[] fields;            // -f, --fields
    RestOptionVal rest;         // -r, --rest none|first|last
    bool versionWanted = false; // -V, --version

    /** Parses the command line (a wrapper around getopt).
     *
     * Recognized options are removed from cmdArgs by getopt and stored in this struct.
     * The field list is handed to makeFieldListOptionHandler with the
     * Yes.convertToZeroBasedIndex flag.
     *
     * Params:
     *   cmdArgs = The program arguments. Modified in place by getopt.
     *
     * Returns:
     *   A tuple (bool, int). The bool is true when processing succeeded and the program
     *   should carry on. It is false when help or version output was requested, or when
     *   an error occurred; in that case the int is the exit code to use (0 for help and
     *   version, 1 for an error). Errors are reported on stderr rather than thrown.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        if (cmdArgs.length > 0)
        {
            programName = cmdArgs[0].stripExtension.baseName;
        }
        else
        {
            programName = "Unknown_program_name";
        }

        try
        {
            // Multi-valued options given on the command line are comma separated.
            arraySep = ",";

            // Note: the config.caseSensitive / config.caseInsensitive entries apply to the
            // options that follow them, so the ordering of this argument list matters.
            auto parsed = getopt(
                cmdArgs,

                std.getopt.config.caseSensitive,
                "H|header",
                "                 Treat the first line of each file as a header.",
                &hasHeader,
                std.getopt.config.caseInsensitive,

                "f|fields",
                "<field-list>     (Required) Fields to extract. Fields are output in the order listed.",
                fields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "r|rest",
                "none|first|last  Location for remaining fields. Default: none",
                &rest,

                "d|delimiter",
                "CHR              Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)",
                &delim,

                std.getopt.config.caseSensitive,
                "V|version",
                "                 Print version information and exit.",
                &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            // Help takes precedence over version; both end the run with exit code 0.
            if (parsed.helpWanted)
            {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }

            if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            // Consistency check: a field list is mandatory.
            if (fields.length == 0)
            {
                throw new Exception("Required option --f|fields was not supplied.");
            }
        }
        catch (Exception exc)
        {
            // Covers both getopt failures and the consistency check above.
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }
}
130 
/** Program entry point.
 *
 * Parses the command line, then dispatches to the tsvSelect instantiation matching the
 * --rest option. Returns 0 on success and 1 if tsvSelect throws an Exception. When
 * argument processing asks to stop, the exit code it supplies is returned.
 */
int main(string[] cmdArgs)
{
    /* Merge coverage reports when built with DMD in code coverage mode. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    auto argsStatus = cmdopt.processArgs(cmdArgs);
    immutable bool keepGoing = argsStatus[0];
    if (!keepGoing)
    {
        return argsStatus[1];
    }

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    alias RestOpt = TsvSelectOptions.RestOptionVal;
    int exitCode = 0;

    try
    {
        /* getopt has removed the option arguments, leaving the program name followed by
         * any file names. Everything after the program name is passed on as input files.
         * The --rest value is a run-time value, so each case names its own compile-time
         * template argument.
         */
        auto inputFiles = cmdArgs[1 .. $];

        final switch (cmdopt.rest)
        {
        case RestOpt.none:
            tsvSelect!(CTERestLocation.none)(cmdopt, inputFiles);
            break;

        case RestOpt.first:
            tsvSelect!(CTERestLocation.first)(cmdopt, inputFiles);
            break;

        case RestOpt.last:
            tsvSelect!(CTERestLocation.last)(cmdopt, inputFiles);
            break;
        }
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        exitCode = 1;
    }

    return exitCode;
}
177 
178 // tsvSelect
179 
/** Compile-time selector for the tsvSelect template specializations.
 *
 * The members mirror TsvSelectOptions.RestOptionVal one-for-one. main maps the run-time
 * option value onto this enum to pick which tsvSelect instantiation to call. The two
 * enums are kept apart on purpose: the TsvSelectOptions one defines the text users type
 * on the command line, while this one is internal and not part of the end-user UI.
 */
enum CTERestLocation
{
    none,
    first,
    last
}
188 
/** Core routine of tsv-select: reads the input and writes the selected fields.
 *
 * Every input line is split on the delimiter. The fields named in cmdopt.fields are
 * written in the order listed; depending on the template argument, the fields not
 * listed are dropped, written before, or written after the listed ones.
 *
 * The --rest handling is a template parameter rather than a run-time test so that the
 * per-field loop contains no branching on it. main instantiates the template once per
 * --rest value. The original authors report a run-time improvement of about 25% over a
 * non-templatized version. ('cte' stands for 'compile time evaluation'.)
 *
 * Params:
 *   cteRest    = Compile-time location of the unlisted fields. Must agree with cmdopt.rest.
 *   cmdopt     = The processed command line options.
 *   inputFiles = Files to read. Empty means standard input; "-" also means standard input.
 *
 * Throws:
 *   Exception if a line has fewer fields than the field list requires. Exceptions from
 *   file I/O and from throwIfWindowsNewlineOnUnix are not caught here.
 */
void tsvSelect(CTERestLocation cteRest)(in TsvSelectOptions cmdopt, in string[] inputFiles)
{
    import tsv_utils.common.utils: BufferedOutputRange, InputFieldReordering, throwIfWindowsNewlineOnUnix;
    import std.algorithm: splitter;
    import std.format: format;
    import std.range;

    // True for the instantiations that must hold on to the fields not in the field list.
    enum bool collectRest = (cteRest != CTERestLocation.none);

    // Sanity check that the caller picked the instantiation matching the run-time option.
    static if (cteRest == CTERestLocation.none)
        enum expectedRest = TsvSelectOptions.RestOptionVal.none;
    else static if (cteRest == CTERestLocation.first)
        enum expectedRest = TsvSelectOptions.RestOptionVal.first;
    else static if (cteRest == CTERestLocation.last)
        enum expectedRest = TsvSelectOptions.RestOptionVal.last;
    else
        static assert (false, "Unexpected cteRest value.");

    assert(cmdopt.rest == expectedRest);

    /* Receives the fields of a line one at a time and keeps the ones on the field list,
     * arranged in output order.
     */
    auto reorderer = new InputFieldReordering!char(cmdopt.fields);

    /* Unlisted fields are gathered in an Appender so they can be written as a group.
     * The same Appender is cleared and reused for every line.
     */
    static if (collectRest)
    {
        auto restFields = appender!(char[][]);
    }

    /* Output goes through BufferedOutputRange (from tsv_utils.common.utils) rather than
     * being written to stdout directly.
     */
    auto output = BufferedOutputRange!(typeof(stdout))(stdout);

    /* With no files given, read standard input. The name "-" also selects standard
     * input, as is customary for unix command line tools.
     */
    foreach (fileNum, filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        immutable bool isStdin = (filename == "-");
        immutable bool dropFirstLine = cmdopt.hasHeader && fileNum > 0;

        auto inputStream = isStdin ? stdin : filename.File();

        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            if (lineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, lineNum);

                // Only the first file's header is kept.
                if (dropFirstLine) continue;
            }

            static if (collectRest)
            {
                restFields.clear;
            }
            reorderer.initNewLine;

            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                static if (collectRest)
                {
                    // A field that matched nothing on the field list belongs to the rest.
                    if (reorderer.processNextField(fieldIndex, fieldValue) == 0)
                    {
                        restFields.put(fieldValue);
                    }
                }
                else
                {
                    // Nothing else is needed from this line once every listed field is found.
                    reorderer.processNextField(fieldIndex, fieldValue);
                    if (reorderer.allFieldsFilled) break;
                }
            }

            // The whole line has been scanned; every listed field must have been seen.
            if (!reorderer.allFieldsFilled)
            {
                throw new Exception(
                    format("Not enough fields in line. File: %s,  Line: %s",
                           isStdin ? "Standard Input" : filename, lineNum));
            }

            // Emit the line: [rest, delim] listed-fields [delim, rest] newline.

            static if (cteRest == CTERestLocation.first)
            {
                if (restFields.data.length > 0)
                {
                    output.joinAppend(restFields.data, cmdopt.delim);
                    output.append(cmdopt.delim);
                }
            }

            output.joinAppend(reorderer.outputFields, cmdopt.delim);

            static if (cteRest == CTERestLocation.last)
            {
                if (restFields.data.length > 0)
                {
                    output.append(cmdopt.delim);
                    output.joinAppend(restFields.data, cmdopt.delim);
                }
            }

            output.appendln;
        }
    }
}