1 /** 2 A variant of the Unix 'cut' program, with the ability to reorder fields. 3 4 tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder 5 fields. Lines are read from files or standard input and split on a delimiter character. 6 Fields are written to standard output in the order listed. Fields can be listed more 7 than once, and fields not listed can be written out as a group. 8 9 This program is intended both as a useful utility and a D programming language example. 10 Functionality and constructs used include command line argument processing, file I/O, 11 exception handling, ranges, tuples and strings, universal function call syntax (UFCS), 12 lambdas and functional programming constructs. Comments are more verbose than typical 13 to shed light on D programming constructs, but not to the level of a tutorial. 14 15 Copyright (c) 2015-2016, eBay Software Foundation 16 Initially written by Jon Degenhardt 17 18 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) 19 */ 20 21 // Module name defaults to file name, but hyphens are not allowed, so set it here. 22 module tsv_select; 23 24 // Imports used by multiple routines. Other imports are made in local context. 25 import std.stdio; 26 import std.typecons : tuple, Tuple; 27 28 // 'Heredoc' style help text. When printed it is followed by a getopt formatted option list. 29 auto helpText = q"EOS 30 Synopsis: tsv-select -f n[,n...] [options] [file...] 31 32 tsv-select reads files or standard input and writes specified fields to 33 standard output in the order listed. Similar to 'cut' with the ability to 34 reorder fields. Fields can be listed more than once, and fields not 35 listed can be output using the --rest option. Examples: 36 37 tsv-select -f 4,2,9 file1.tsv file2.tsv 38 tsv-select --delimiter ' ' -f 2,4,6 --rest last file1.txt 39 cat file*.tsv | tsv-select -f 3,2,1 40 41 Options: 42 EOS"; 43 44 /** 45 Container for command line options. 
*/
struct TsvSelectOptions {
    // Values accepted by the --rest option: where fields not named in --fields
    // are placed relative to the selected fields.
    enum RestOptionVal { none, first, last }

    char delim = '\t';      // Field delimiter (--delimiter).
    size_t[] fields;        // Fields to output, in output order (--fields). One-based as
                            // entered; zero-based once processArgs has succeeded.
    RestOptionVal rest;     // Placement of the unlisted fields (--rest).

    /* Parses and validates the command line.
     *
     * Returns a (bool, int) tuple. The bool is true when the arguments were processed
     * successfully and the program should carry on. It is false when an error occurred
     * or help was requested; in that case the int is the exit code to use (0 after
     * printing help, 1 after an error).
     *
     * getopt removes the options it recognizes from cmdArgs. On a 'true' return the
     * field numbers have been validated and converted to zero-based indices.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.getopt;

        try {
            // Multi-valued options (the field list) are comma separated.
            arraySep = ",";
            auto parsed = getopt(
                cmdArgs,
                "f|fields", "n[,n...] (Required) Fields to extract. Fields are output in the order listed.", &fields,
                "r|rest", "none|first|last Location for remaining fields. Default: none", &rest,
                "d|delimiter", "CHR Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim
            );

            if (parsed.helpWanted) {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }

            /* Validation. All checks run before any field value is modified. */
            if (fields.length == 0) {
                throw new Exception("Required option --f|fields was not supplied.");
            }

            foreach (fieldNum; fields) {
                if (fieldNum == 0) {
                    throw new Exception("Zero is not a valid field number (--f|fields).");
                }
            }

            /* Derivation: switch from the one-based numbers given by the user to
             * zero-based indices. Iterating by 'ref' updates the array elements
             * themselves rather than copies.
             */
            foreach (ref fieldNum; fields) {
                --fieldNum;
            }
        } catch (Exception exc) {
            stderr.writeln("Error processing command line arguments: ", exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/**
Main program.
*/
int main(string[] cmdArgs) {
    TsvSelectOptions options;

    // processArgs reports (continue?, exit code); stop right away when told not to continue.
    auto argStatus = options.processArgs(cmdArgs);
    if (!argStatus[0]) {
        return argStatus[1];
    }

    int exitCode = 0;
    try {
        /* getopt has stripped the option arguments from cmdArgs. What is left is the
         * program name followed by any file names; only the file names are passed on.
         */
        tsvSelect(options, cmdArgs[1..$]);
    }
    catch (Exception exc) {
        stderr.writeln("Error: ", exc.msg);
        exitCode = 1;
    }

    return exitCode;
}

/**
tsvSelect does the primary work of the tsv-select program.

Each input line is split on the delimiter, the requested fields are collected in
the order given by the options, and the result is written to standard output.
Fields that were not requested are optionally written as a group before or after
the requested ones (the --rest option). An exception is thrown when a line has
too few fields or on any other error.
*/
void tsvSelect(in TsvSelectOptions cmdopt, in string[] inputFiles) {
    import tsvutil: InputFieldReordering;
    import std.algorithm: splitter;
    import std.format: format;
    import std.range;

    alias Rest = TsvSelectOptions.RestOptionVal;
    const restPlacement = cmdopt.rest;
    const bool keepRest = (restPlacement != Rest.none);

    /* InputFieldReordering gathers the selected fields of a line into a buffer
     * arranged in output order.
     */
    auto reorderer = new InputFieldReordering!char(cmdopt.fields);

    /* Fields that are not on the --fields list are collected here so they can be
     * written as a group. The Appender is cleared and reused for every line.
     */
    auto restFields = appender!(char[][]);

    /* With no file arguments, read standard input. A file name of "-" also means
     * standard input, as is customary for unix command line tools.
     */
    auto sources = (inputFiles.length > 0) ? inputFiles : ["-"];
    foreach (filename; sources) {
        const bool fromStdin = (filename == "-");
        auto inputStream = fromStdin ? stdin : File(filename);

        // Line numbers start at 1 so they can be used directly in error messages.
        foreach (lineNum, line; inputStream.byLine.enumerate(1)) {
            reorderer.initNewLine;
            restFields.clear();

            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) {
                auto matchCount = reorderer.processNextField(fieldIndex, fieldValue);
                if (matchCount != 0) {
                    // Every requested field is in hand and the remainder is not
                    // wanted, so the rest of the line need not be scanned.
                    if (reorderer.allFieldsFilled && !keepRest) {
                        break;
                    }
                }
                else if (keepRest) {
                    restFields.put(fieldValue);
                }
            }

            // The line ended. Any requested field still missing is an error.
            if (!reorderer.allFieldsFilled) {
                auto sourceName = fromStdin ? "Standard Input" : filename;
                throw new Exception(
                    format("Not enough fields in line. File: %s, Line: %s", sourceName, lineNum));
            }

            // Assemble the output line. chain needs a range in every position, so the
            // side that is not in use stays an empty array.
            char[][] leading;
            char[][] trailing;
            if (restPlacement == Rest.first) {
                leading = restFields.data;
            }
            else if (restPlacement == Rest.last) {
                trailing = restFields.data;
            }
            writeln(chain(leading, reorderer.outputFields, trailing).join(cmdopt.delim));
        }
    }
}