/**
A variant of the unix 'cut' program, with the ability to reorder fields.

tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
fields. Lines are read from files or standard input and split on a delimiter character.
Fields are written to standard output in the order listed. Fields can be listed more
than once, and fields not listed can be written out as a group.

This program is intended both as a useful utility and a D programming language example.
Functionality and constructs used include command line argument processing, file I/O,
exception handling, ranges, tuples and strings, templates, universal function call syntax
(UFCS), lambdas and functional programming constructs. Comments are more verbose than
typical to shed light on D programming constructs, but not to the level of a tutorial.

Copyright (c) 2015-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.tsv_select;    // Module name defaults to file name, but hyphens not allowed, so set it here.

// Imports used by multiple routines. Other imports are made in local context.
import std.stdio;
import std.typecons : tuple, Tuple;

// 'Heredoc' style help text. When printed it is followed by a getopt formatted option list.
auto helpText = q"EOS
Synopsis: tsv-select -f <field-list> [options] [file...]

tsv-select reads files or standard input and writes specified fields to standard
output in the order listed. Similar to 'cut' with the ability to reorder fields.

Fields numbers start with one. They are comma separated, and ranges can be used.
Fields can be listed more than once, and fields not listed can be output using
the --rest option. Multiple files with header lines can be managed with the
--header option, which retains the header of the first file and drops the rest.

Examples:

tsv-select -f 4,2,9 file1.tsv file2.tsv
tsv-select -f 1,4-7,11 file1.tsv
tsv-select -f 1,7-4,11 file1.tsv
tsv-select --delimiter ' ' -f 2,4,6 --rest last file1.txt
cat file*.tsv | tsv-select -f 3,2,1

Options:
EOS";

/** Holds the command line options and the logic that parses them.
 */
struct TsvSelectOptions
{
    /// Spellings accepted on the command line for the --rest option.
    enum RestOptionVal { none, first, last }

    string programName;             // Derived from cmdArgs[0] by processArgs
    bool hasHeader = false;         // -H, --header
    char delim = '\t';              // -d, --delimiter
    size_t[] fields;                // -f, --fields
    RestOptionVal rest;             // -r, --rest none|first|last
    bool versionWanted = false;     // -V, --version

    /** Parses the command line (a cover over getopt) and validates the result.
     *
     * Params:
     *   cmdArgs = The program arguments. getopt removes the options it recognizes,
     *             leaving the program name and any non-option arguments.
     *
     * Returns: A (bool, int) tuple. The bool is true when parsing succeeded and the
     *   program should carry on. It is false when help or version output was requested
     *   (exit code 0) or when an error occurred (exit code 1); the int is that exit code.
     *
     * Exceptions raised during parsing or validation are caught here and reported on
     * stderr. The field list handler is instantiated with Yes.convertToZeroBasedIndex,
     * so on success the 'fields' member is expected to hold zero-based indices.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        if (cmdArgs.length > 0)
        {
            programName = cmdArgs[0].stripExtension.baseName;
        }
        else
        {
            programName = "Unknown_program_name";
        }

        try
        {
            arraySep = ",";    // Values within a single option are comma separated.

            // Short option letters are case sensitive for -H and -V only, hence the
            // config toggles placed around those two entries.
            auto parsed = getopt(
                cmdArgs,

                std.getopt.config.caseSensitive,
                "H|header",
                " Treat the first line of each file as a header.",
                &hasHeader,
                std.getopt.config.caseInsensitive,

                "f|fields",
                "<field-list> (Required) Fields to extract. Fields are output in the order listed.",
                fields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "r|rest",
                "none|first|last Location for remaining fields. Default: none",
                &rest,

                "d|delimiter",
                "CHR Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)",
                &delim,

                std.getopt.config.caseSensitive,
                "V|version",
                " Print version information and exit.",
                &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            // Help takes precedence over version; both end the program successfully.
            if (parsed.helpWanted)
            {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }

            if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /* Consistency checks */
            if (fields.length == 0)
            {
                throw new Exception("Required option --f|fields was not supplied.");
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }
}

/** Program entry point.
 *
 * Parses the command line, then dispatches to the tsvSelect instantiation matching the
 * --rest option. Returns the process exit code: 0 on success, 1 if tsvSelect threw, or
 * the code supplied by processArgs when it asks for the program to stop.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const argStatus = cmdopt.processArgs(cmdArgs);
    if (!argStatus[0]) return argStatus[1];

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    alias Rest = TsvSelectOptions.RestOptionVal;

    try
    {
        /* getopt has already stripped the option arguments, so what is left is the
         * program name followed by the input files. Only the files go to tsvSelect.
         */
        auto inputFiles = cmdArgs[1 .. $];

        final switch (cmdopt.rest)
        {
            case Rest.none:
                tsvSelect!(CTERestLocation.none)(cmdopt, inputFiles);
                break;

            case Rest.first:
                tsvSelect!(CTERestLocation.first)(cmdopt, inputFiles);
                break;

            case Rest.last:
                tsvSelect!(CTERestLocation.last)(cmdopt, inputFiles);
                break;
        }
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }

    return 0;
}

// tsvSelect

/** Compile-time selector for the tsvSelect template instantiations.
 *
 * CTERestLocation mirrors the TsvSelectOptions.RestOptionVal enum value for value. main
 * uses it to pick the tsvSelect instantiation to call. It is kept as a separate type so
 * that the template parameter is decoupled from the end-user UI: the TsvSelectOptions
 * version defines the text of the values allowed on the command line.
 */
enum CTERestLocation { none, first, last }

/** tsvSelect does the primary work of the tsv-select program.
 *
 * Input is read line by line, extracting the listed fields and writing them out in the order
 * specified. An exception is thrown on error.
 *
 * This function is templatized with instantiations for the different --rest options.
This 195 * avoids repeatedly running the same if-tests inside the inner loop. The main function 196 * instantiates this function three times, once for each of the --rest options. It results 197 * in a larger program, but is faster. Run-time improvements of 25% were measured compared 198 * to the non-templatized version. (Note: 'cte' stands for 'compile time evaluation'.) 199 */ 200 void tsvSelect(CTERestLocation cteRest)(in TsvSelectOptions cmdopt, in string[] inputFiles) 201 { 202 import tsv_utils.common.utils: BufferedOutputRange, InputFieldReordering, throwIfWindowsNewlineOnUnix; 203 import std.algorithm: splitter; 204 import std.format: format; 205 import std.range; 206 207 // Ensure the correct template instantiation was called. 208 static if (cteRest == CTERestLocation.none) 209 assert(cmdopt.rest == TsvSelectOptions.RestOptionVal.none); 210 else static if (cteRest == CTERestLocation.first) 211 assert(cmdopt.rest == TsvSelectOptions.RestOptionVal.first); 212 else static if (cteRest == CTERestLocation.last) 213 assert(cmdopt.rest == TsvSelectOptions.RestOptionVal.last); 214 else 215 static assert (false, "Unexpected cteRest value."); 216 217 /* InputFieldReordering copies select fields from an input line to a new buffer. 218 * The buffer is reordered in the process. 219 */ 220 auto fieldReordering = new InputFieldReordering!char(cmdopt.fields); 221 222 /* Fields not on the --fields list are added to a separate buffer so they can be 223 * output as a group (the --rest option). This is done using an 'Appender', which 224 * is faster than the ~= operator. The Appender is passed a GC allocated buffer 225 * that grows as needed and is reused for each line. Typically it'll grow only 226 * on the first line. 227 */ 228 static if (cteRest != CTERestLocation.none) 229 { 230 auto leftOverFieldsAppender = appender!(char[][]); 231 } 232 233 /* BufferedOutputRange (from tsvutils.d) is a performance improvement over writing 234 * directly to stdout. 
235 */ 236 auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); 237 238 /* Read each input file (or stdin) and iterate over each line. A filename of "-" is 239 * interpreted as stdin, common behavior for unix command line tools. 240 */ 241 foreach (fileNum, filename; (inputFiles.length > 0) ? inputFiles : ["-"]) 242 { 243 auto inputStream = (filename == "-") ? stdin : filename.File(); 244 foreach (lineNum, line; inputStream.byLine.enumerate(1)) 245 { 246 if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum); 247 248 if (lineNum == 1 && fileNum > 0 && cmdopt.hasHeader) 249 { 250 continue; // Drop the header line from all but the first file. 251 } 252 static if (cteRest != CTERestLocation.none) 253 { 254 leftOverFieldsAppender.clear; 255 } 256 fieldReordering.initNewLine; 257 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 258 { 259 static if (cteRest == CTERestLocation.none) 260 { 261 fieldReordering.processNextField(fieldIndex, fieldValue); 262 if (fieldReordering.allFieldsFilled) break; 263 } 264 else 265 { 266 auto numMatched = fieldReordering.processNextField(fieldIndex, fieldValue); 267 if (numMatched == 0) leftOverFieldsAppender.put(fieldValue); 268 } 269 } 270 // Finished with all fields in the line. 271 if (!fieldReordering.allFieldsFilled) 272 { 273 throw new Exception( 274 format("Not enough fields in line. File: %s, Line: %s", 275 (filename == "-") ? "Standard Input" : filename, lineNum)); 276 } 277 278 // Write the re-ordered line. 
279 280 static if (cteRest == CTERestLocation.first) 281 { 282 if (leftOverFieldsAppender.data.length > 0) 283 { 284 bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim); 285 bufferedOutput.append(cmdopt.delim); 286 } 287 } 288 289 bufferedOutput.joinAppend(fieldReordering.outputFields, cmdopt.delim); 290 291 static if (cteRest == CTERestLocation.last) 292 { 293 if (leftOverFieldsAppender.data.length > 0) 294 { 295 bufferedOutput.append(cmdopt.delim); 296 bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim); 297 } 298 } 299 300 bufferedOutput.appendln; 301 } 302 } 303 }