1 /**
2 A variant of the unix 'cut' program, with the ability to reorder fields.
3 
4 tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
5 fields. Lines are read from files or standard input and split on a delimiter character.
6 Fields are written to standard output in the order listed. Fields can be listed more
7 than once, and fields not listed can be written out as a group.
8 
9 This program is intended both as a useful utility and a D programming language example.
10 Functionality and constructs used include command line argument processing, file I/O,
11 exception handling, ranges, tuples and strings, templates, universal function call syntax
12 (UFCS), lambdas and functional programming constructs. Comments are more verbose than
13 typical to shed light on D programming constructs, but not to the level of a tutorial.
14 
15 Copyright (c) 2015-2019, eBay Software Foundation
16 Initially written by Jon Degenhardt
17 
18 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
19 */
20 
21 module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.
22 
// Imports used by multiple routines. Other imports are made in local context.
24 import std.stdio;
25 import std.typecons : tuple, Tuple;
26 
// Help text, written as a 'heredoc' style delimited string. processArgs passes it to
// defaultGetoptPrinter, which prints it followed by the getopt formatted option list
// (hence the trailing "Options:" line).
immutable helpText = q"EOS
Synopsis: tsv-select -f <field-list> [options] [file...]

tsv-select reads files or standard input and writes specified fields to standard
output in the order listed. Similar to 'cut' with the ability to reorder fields.

Field numbers start with one. They are comma separated, and ranges can be used.
Fields can be listed more than once, and fields not listed can be output using
the --rest option. Multiple files with header lines can be managed with the
--header option, which retains the header of the first file and drops the rest.

Examples:

   tsv-select -f 4,2,9 file1.tsv file2.tsv
   tsv-select -f 1,4-7,11 file1.tsv
   tsv-select -f 1,7-4,11 file1.tsv
   tsv-select --delimiter ' ' -f 2,4,6 --rest last file1.txt
   cat file*.tsv | tsv-select -f 3,2,1

Options:
EOS";
49 
/** Holds the tsv-select command line settings and the logic that parses them.
 */
struct TsvSelectOptions
{
    /// Values accepted on the command line by the --rest option.
    enum RestOptionVal { none, first, last }

    string programName;           // Derived from cmdArgs[0]; used in error messages.
    bool hasHeader = false;       // -H, --header
    char delim = '\t';            // -d, --delimiter
    size_t[] fields;              // -f, --fields (stored zero-based by the option handler)
    RestOptionVal rest;           // -r, --rest none|first|last
    bool versionWanted = false;   // -V, --version

    /** Parses and validates the command line (a cover over getopt).
     *
     * Recognized options are consumed from cmdArgs by getopt and the struct members
     * are populated from them. The field list handler is instantiated with
     * Yes.convertToZeroBasedIndex, so 'fields' holds zero-based indices afterwards.
     *
     * Returns a (bool, int) tuple:
     *   - (true, 0)  when arguments were accepted and the program should proceed.
     *   - (false, 0) when help or version information was printed.
     *   - (false, 1) when an error occurred; a message has been written to stderr.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils :  makeFieldListOptionHandler;

        // The program name is the executable's base name without its extension.
        if (cmdArgs.length > 0)
        {
            programName = cmdArgs[0].stripExtension.baseName;
        }
        else
        {
            programName = "Unknown_program_name";
        }

        try
        {
            // Multi-valued command line options are comma separated.
            arraySep = ",";

            // Single letter forms of --header and --version are case sensitive; the
            // options in between are matched case insensitively.
            auto parsed = getopt(
                cmdArgs,

                std.getopt.config.caseSensitive,
                "H|header",
                "                 Treat the first line of each file as a header.",
                &hasHeader,
                std.getopt.config.caseInsensitive,

                "f|fields",
                "<field-list>     (Required) Fields to extract. Fields are output in the order listed.",
                makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex)(fields),

                "r|rest",
                "none|first|last  Location for remaining fields. Default: none",
                &rest,

                "d|delimiter",
                "CHR              Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)",
                &delim,

                std.getopt.config.caseSensitive,
                "V|version",
                "                 Print version information and exit.",
                &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            // Help takes precedence over a version request.
            if (parsed.helpWanted)
            {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }

            if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            // Validation: at least one field must have been requested.
            if (fields.length == 0)
            {
                throw new Exception("Required option --f|fields was not supplied.");
            }
        }
        catch (Exception e)
        {
            // Covers both getopt failures and the validation error thrown above.
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, e.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }
}
130 
/* Embed a D runtime option in the executable through the rt_options hook. The
 * "gcopt=cleanup:none" setting asks the garbage collector to skip its cleanup step
 * at program termination. Only declared when compiling with frontend version 2.085
 * or later.
 */
static if (__VERSION__ >= 2085)
{
    extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
}
132 
/** Program entry point.
 *
 * Processes the command line, then runs the tsvSelect instantiation that matches the
 * --rest setting. The return value is the process exit status: the code supplied by
 * processArgs when it says not to continue (help, version, or an argument error),
 * 1 when tsvSelect throws an Exception, and 0 otherwise.
 */
int main(string[] cmdArgs)
{
    /* Merge coverage reports when built by DMD in code coverage mode. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const argStatus = cmdopt.processArgs(cmdArgs);
    if (!argStatus[0])
    {
        return argStatus[1];
    }

    /* Under LDC profiling builds, reset the profile counters once argument
     * processing is done.
     */
    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    alias RestVal = TsvSelectOptions.RestOptionVal;

    try
    {
        /* getopt has removed the option arguments, so cmdArgs now holds the program
         * name followed by any input files. Everything after the program name is
         * handed to tsvSelect.
         */
        auto inputFiles = cmdArgs[1 .. $];

        // Translate the run-time --rest value into the compile-time template argument.
        final switch (cmdopt.rest)
        {
        case RestVal.none:
            tsvSelect!(CTERestLocation.none)(cmdopt, inputFiles);
            break;

        case RestVal.first:
            tsvSelect!(CTERestLocation.first)(cmdopt, inputFiles);
            break;

        case RestVal.last:
            tsvSelect!(CTERestLocation.last)(cmdopt, inputFiles);
            break;
        }
    }
    catch (Exception e)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, e.msg);
        return 1;
    }

    return 0;
}
179 
180 // tsvSelect
181 
/** Compile-time selector for the tsvSelect template specializations.
 *
 * The members mirror TsvSelectOptions.RestOptionVal one for one. main maps the
 * run-time option value onto this enum to pick which tsvSelect instantiation to call.
 * Keeping the two enums separate decouples this internal selector from the user
 * interface: the member names of TsvSelectOptions.RestOptionVal are the literal
 * values accepted on the command line.
 */
enum CTERestLocation
{
    none,
    first,
    last,
}
190 
/** The workhorse of tsv-select: reads the input and writes the selected fields.
 *
 * Each input line is split on the delimiter. The fields named in cmdopt.fields are
 * written in the order requested, optionally preceded or followed by the remaining
 * fields (--rest first|last). When --header is in effect, the first line of every
 * file after the first is dropped. An Exception is thrown if a line has too few
 * fields.
 *
 * The --rest mode is a template parameter rather than a run-time test so that the
 * per-field loop contains no mode checks. main instantiates the template once per
 * mode. This makes the executable larger, but the original author measured run-time
 * improvements of 25% over a non-templatized version. ('cte' stands for
 * 'compile time evaluation'.)
 *
 * Params:
 *   cteRest    = Compile-time --rest mode; must agree with cmdopt.rest.
 *   cmdopt     = Processed command line options.
 *   inputFiles = Files to read. Empty means standard input; "-" also means
 *                standard input.
 */
void tsvSelect(CTERestLocation cteRest)(in TsvSelectOptions cmdopt, in string[] inputFiles)
{
    import tsv_utils.common.utils: BufferedOutputRange, bufferedByLine, InputFieldReordering, throwIfWindowsNewlineOnUnix;
    import std.algorithm: splitter;
    import std.format: format;
    import std.range;

    alias RestVal = TsvSelectOptions.RestOptionVal;

    // Map the compile-time selector onto the run-time option value it represents and
    // verify the caller picked the matching instantiation.
    static if (cteRest == CTERestLocation.none)
        enum expectedRest = RestVal.none;
    else static if (cteRest == CTERestLocation.first)
        enum expectedRest = RestVal.first;
    else static if (cteRest == CTERestLocation.last)
        enum expectedRest = RestVal.last;
    else
        static assert (false, "Unexpected cteRest value.");

    assert(cmdopt.rest == expectedRest);

    // True for the instantiations that also have to output the unlisted fields.
    enum bool collectRest = (cteRest != CTERestLocation.none);

    /* Receives the fields of a line one at a time and makes the selected ones
     * available in output order.
     */
    auto reorderer = new InputFieldReordering!char(cmdopt.fields);

    /* Fields that are not selected are gathered here so they can be written as a
     * group. The appender is cleared and reused for every line.
     */
    static if (collectRest)
    {
        auto restFields = appender!(char[][]);
    }

    // Output goes through a buffering wrapper around stdout.
    auto output = BufferedOutputRange!(typeof(stdout))(stdout);

    // No files on the command line means: read standard input.
    auto files = (inputFiles.length > 0) ? inputFiles : ["-"];

    foreach (fileNum, filename; files)
    {
        immutable bool isStdin = (filename == "-");
        auto inputStream = isStdin ? stdin : File(filename);

        // Lines are numbered from one.
        foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1))
        {
            if (lineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, lineNum);

                // With --header, only the first file's header line is passed through.
                if (fileNum > 0 && cmdopt.hasHeader) continue;
            }

            static if (collectRest)
            {
                restFields.clear;
            }
            reorderer.initNewLine;

            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                static if (collectRest)
                {
                    // Every field must be visited; unmatched ones go to the rest group.
                    if (reorderer.processNextField(fieldIndex, fieldValue) == 0)
                    {
                        restFields.put(fieldValue);
                    }
                }
                else
                {
                    // Nothing else is needed from the line once all fields are found.
                    reorderer.processNextField(fieldIndex, fieldValue);
                    if (reorderer.allFieldsFilled) break;
                }
            }

            // The line has been consumed; every requested field must now be present.
            if (!reorderer.allFieldsFilled)
            {
                const sourceName = isStdin ? "Standard Input" : filename;
                throw new Exception(
                    format("Not enough fields in line. File: %s,  Line: %s", sourceName, lineNum));
            }

            // Emit the line: [rest fields] selected fields [rest fields].
            static if (cteRest == CTERestLocation.first)
            {
                if (restFields.data.length != 0)
                {
                    output.joinAppend(restFields.data, cmdopt.delim);
                    output.append(cmdopt.delim);
                }
            }

            output.joinAppend(reorderer.outputFields, cmdopt.delim);

            static if (cteRest == CTERestLocation.last)
            {
                if (restFields.data.length != 0)
                {
                    output.append(cmdopt.delim);
                    output.joinAppend(restFields.data, cmdopt.delim);
                }
            }

            output.appendln;
        }
    }
}