1 /**
2 A variant of the unix 'cut' program, with the ability to reorder fields.
3 
4 tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
5 fields. Lines are read from files or standard input and split on a delimiter character.
6 Fields are written to standard output in the order listed. Fields can be listed more
7 than once, and fields not listed can be written out as a group.
8 
9 This program is intended both as a useful utility and a D programming language example.
10 Functionality and constructs used include command line argument processing, file I/O,
11 exception handling, ranges, tuples and strings, universal function call syntax (UFCS),
12 lambdas and functional programming constructs. Comments are more verbose than typical
13 to shed light on D programming constructs, but not to the level of a tutorial.
14 
15 Copyright (c) 2015-2016, eBay Software Foundation
16 Initially written by Jon Degenhardt
17 
18 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
19 */
20 
21 // Module name defaults to file name, but hyphens not allowed, so set it here.
22 module tsv_select;
23 
// Imports used by multiple routines. Other imports are made in local scope.
25 import std.stdio;
26 import std.typecons : tuple, Tuple;
27 
// Help text, written as a 'heredoc' style delimited string (q"EOS ... EOS"). When help is
// requested, processArgs passes it to defaultGetoptPrinter, which prints this text followed
// by a getopt formatted option list. That is why the text ends with the "Options:" heading.
auto helpText = q"EOS
Synopsis: tsv-select -f n[,n...] [options] [file...]

tsv-select reads files or standard input and writes specified fields to
standard output in the order listed. Similar to 'cut' with the ability to
reorder fields. Fields can be listed more than once, and fields not
listed can be output using the --rest option. Examples:

   tsv-select -f 4,2,9 file1.tsv file2.tsv
   tsv-select --delimiter ' ' -f 2,4,6 --rest last file1.txt
   cat file*.tsv | tsv-select -f 3,2,1

Options:
EOS";
43 
/**
Container for command line options.

The member initializers are the program defaults: TAB as the delimiter, no fields
selected, and 'none' for the --rest option. processArgs fills the members in from
the command line.
 */
struct TsvSelectOptions {
    enum RestOptionVal { none, first, last }  // Values allowed in --rest option.

    char delim = '\t';     // Field delimiter (-d|--delimiter).
    size_t[] fields;       // Fields to output (-f|--fields). Zero-based after processArgs succeeds.
    RestOptionVal rest;    // Where to write fields not listed (-r|--rest). Defaults to 'none'.

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * cmdArgs is passed by 'ref' because getopt removes the options it recognizes,
     * leaving the program name and any remaining arguments for the caller.
     *
     * Exceptions raised while processing (by getopt or by the consistency checks) are
     * caught here: the message is written to stderr and (false, 1) is returned.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;

        try {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "f|fields",    "n[,n...]         (Required) Fields to extract. Fields are output in the order listed.", &fields,
                "r|rest",      "none|first|last  Location for remaining fields. Default: none", &rest,
                "d|delimiter", "CHR              Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim
                );

            if (r.helpWanted) {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }

            /* Consistency checks */
            if (fields.length == 0) {
                throw new Exception("Required option --f|fields was not supplied.");
            }

            // The list is known to be non-empty here, so no separate length test is needed.
            if (fields.any!(x => x == 0)) {
                throw new Exception("Zero is not a valid field number (--f|fields).");
            }

            /* Derivations */
            // Convert the one-based field numbers given by the user to zero-based indices.
            // Using 'ref' in the lambda allows the actual field value to be modified.
            // Otherwise a copy would be passed. Every value is >= 1 at this point, so the
            // decrement cannot wrap around.
            fields.each!((ref x) => --x);

        } catch (Exception exc) {
            stderr.writeln("Error processing command line arguments: ", exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
99 
/**
Program entry point.

Processes the command line, then hands the remaining arguments (the input files) to
tsvSelect. Returns the exit code supplied by processArgs when it reports that execution
should not continue, 1 if tsvSelect throws an Exception, and 0 otherwise.
 */
int main(string[] cmdArgs) {
    TsvSelectOptions options;
    auto argStatus = options.processArgs(cmdArgs);

    // argStatus is a (continue?, exit code) pair.
    immutable bool shouldRun = argStatus[0];
    immutable int earlyExitCode = argStatus[1];
    if (!shouldRun) return earlyExitCode;

    int exitCode = 0;
    try {
        // getopt has already removed the option arguments. What is left is the program
        // name followed by any file names, so skip the first element.
        auto inputFiles = cmdArgs[1 .. $];
        tsvSelect(options, inputFiles);
    }
    catch (Exception failure) {
        stderr.writeln("Error: ", failure.msg);
        exitCode = 1;
    }

    return exitCode;
}
122 
/**
tsvSelect carries out the field selection for the tsv-select program.

Each input file (standard input when no files are given, or when a file is named "-")
is read line by line. Every line is split on the delimiter, the fields listed in
cmdopt.fields are collected in the order requested, and the resulting line is written
to standard output. Depending on cmdopt.rest, the fields not listed are written before
or after the selected fields, or dropped.

Throws: Exception if a line does not supply all of the requested fields. The message
names the file and line number.
 */
void tsvSelect(in TsvSelectOptions cmdopt, in string[] inputFiles) {
    import tsvutil: InputFieldReordering;
    import std.algorithm: splitter;
    import std.format: format;
    import std.range;

    alias Rest = TsvSelectOptions.RestOptionVal;

    // Helper that receives the fields of a line one at a time and makes the selected
    // ones available, in output order, through 'outputFields'.
    auto reorderer = new InputFieldReordering!char(cmdopt.fields);

    // Fields that are not selected are gathered here when --rest is 'first' or 'last'.
    // The appender is cleared and reused on every line rather than rebuilt.
    immutable bool collectRest = (cmdopt.rest != Rest.none);
    immutable bool restGoesFirst = (cmdopt.rest == Rest.first);
    immutable bool restGoesLast = (cmdopt.rest == Rest.last);
    auto restFields = appender!(char[][]);

    // With no file arguments, fall back to a single "-" entry, which means stdin.
    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"]) {
        immutable bool fromStdin = (filename == "-");
        auto source = fromStdin ? stdin : filename.File();
        auto sourceDescription = fromStdin ? "Standard Input" : filename;

        // Line numbers start at 1 so they can be reported in error messages.
        foreach (lineNum, line; source.byLine.enumerate(1)) {
            restFields.clear;
            reorderer.initNewLine;

            foreach (index, value; line.splitter(cmdopt.delim).enumerate) {
                auto matchCount = reorderer.processNextField(index, value);
                if (matchCount != 0) {
                    // A selected field. Once every requested field has been seen the
                    // remainder of the line only matters if unlisted fields are kept.
                    if (reorderer.allFieldsFilled && !collectRest) {
                        break;
                    }
                }
                else if (collectRest) {
                    restFields.put(value);
                }
            }

            // The whole line has been examined (or abandoned early because it was complete).
            if (!reorderer.allFieldsFilled) {
                throw new Exception(
                    format("Not enough fields in line. File: %s,  Line: %s",
                           sourceDescription, lineNum));
            }

            // Assemble the output: optional leading rest fields, the selected fields,
            // optional trailing rest fields. An empty array stands in for the unused side
            // so that 'chain' always receives three ranges.
            auto leading = restGoesFirst ? restFields.data : [];
            auto trailing = restGoesLast ? restFields.data : [];
            chain(leading, reorderer.outputFields, trailing).join(cmdopt.delim).writeln;
        }
    }
}