/**
A reordering variant of the Unix 'cut' utility.

tsv-select reads lines from files or standard input, splits each line on a delimiter
character, and writes the requested fields to standard output in the order they were
listed. A field may be listed more than once, and the fields that were not listed can
be emitted together as a group.

Besides being a utility, this program doubles as a D programming language example. It
exercises command line argument processing, file I/O, exception handling, ranges, tuples
and strings, templates, universal function call syntax (UFCS), lambdas and functional
programming constructs. Comments are therefore more verbose than usual, although they
stop short of being a tutorial.

Copyright (c) 2015-2019, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

// The module name normally defaults to the file name. Hyphens are not legal in module
// names, so it is declared explicitly.
module tsv_utils.tsv_select;

// Imports shared by several routines. Everything else is imported in local scope.
import std.stdio;
import std.typecons : tuple, Tuple;

// Help text in 'heredoc' form. When printed, getopt's formatted option list follows it.
immutable helpText = q"EOS
Synopsis: tsv-select -f <field-list> [options] [file...]

tsv-select reads files or standard input and writes specified fields to standard
output in the order listed. Similar to 'cut' with the ability to reorder fields.

Fields numbers start with one. They are comma separated, and ranges can be used.
Fields can be listed more than once, and fields not listed can be output using
the --rest option. Multiple files with header lines can be managed with the
--header option, which retains the header of the first file and drops the rest.

Examples:

tsv-select -f 4,2,9 file1.tsv file2.tsv
tsv-select -f 1,4-7,11 file1.tsv
tsv-select -f 1,7-4,11 file1.tsv
tsv-select --delimiter ' ' -f 2,4,6 --rest last file1.txt
cat file*.tsv | tsv-select -f 3,2,1

Options:
EOS";

/** Holds the values of the command line options.
 */
struct TsvSelectOptions
{
    // Values accepted by the --rest option.
    enum RestOptionVal { none, first, last }

    string programName;
    bool hasHeader = false;        // -H, --header
    char delim = '\t';             // -d, --delimiter
    size_t[] fields;               // -f, --fields
    RestOptionVal rest;            // -r, --rest none|first|last
    bool versionWanted = false;    // -V, --version

    /** Parses and validates the command line (a cover for getopt).
     *
     * getopt is run over cmdArgs, after which the required-option check is applied.
     * The result is a (bool, int) tuple. The bool is true when the arguments were
     * accepted and the program should proceed. It is false when an error occurred or
     * when the user asked for help or version information; in that case the int is
     * the exit code to use (0 or 1).
     *
     * A true result means the options are validated. The field list handler is
     * instantiated with Yes.convertToZeroBasedIndex, so the stored field indices are
     * zero-based.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        if (cmdArgs.length > 0) programName = cmdArgs[0].stripExtension.baseName;
        else programName = "Unknown_program_name";

        try
        {
            // Values of list-style command line options are separated by commas.
            arraySep = ",";

            auto parsed = getopt(
                cmdArgs,

                // -H and -V are matched case-sensitively; the remaining options are not.
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "f|fields", "<field-list> (Required) Fields to extract. Fields are output in the order listed.",
                fields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "r|rest", "none|first|last Location for remaining fields. Default: none", &rest,
                "d|delimiter", "CHR Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,

                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
            );

            // Help takes precedence over a version request.
            if (parsed.helpWanted)
            {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }

            if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /* Consistency checks. Thrown here, reported by the catch block below. */
            if (fields.length == 0)
                throw new Exception("Required option --f|fields was not supplied.");
        }
        catch (Exception err)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, err.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }
}

// Declares the D runtime option string "gcopt=cleanup:none". Only compiled in when the
// compiler front end is version 2.085 or later.
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Program entry point.
 *
 * Processes the command line, then dispatches to the tsvSelect instantiation matching
 * the --rest option. Returns the process exit code: 0 on success, 1 when tsvSelect
 * throws, or the code chosen by processArgs when execution should not continue.
 */
int main(string[] cmdArgs)
{
    /* Enable report merging when built in DMD code coverage mode. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const argStatus = cmdopt.processArgs(cmdArgs);
    if (!argStatus[0]) return argStatus[1];

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    alias RestVal = TsvSelectOptions.RestOptionVal;

    try
    {
        /* getopt has removed the option arguments, leaving the program name followed
         * by any file names. Only the file names are handed to tsvSelect.
         */
        auto files = cmdArgs[1 .. $];

        final switch (cmdopt.rest)
        {
            case RestVal.none:
                tsvSelect!(CTERestLocation.none)(cmdopt, files);
                break;

            case RestVal.first:
                tsvSelect!(CTERestLocation.first)(cmdopt, files);
                break;

            case RestVal.last:
                tsvSelect!(CTERestLocation.last)(cmdopt, files);
                break;
        }
    }
    catch (Exception err)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, err.msg);
        return 1;
    }

    return 0;
}

// tsvSelect

/** Selects which specialization of the tsvSelect template is instantiated.
 *
 * The members mirror TsvSelectOptions.RestOptionVal, and main maps one onto the other
 * when choosing the instantiation to call. The two enums are kept apart on purpose:
 * the TsvSelectOptions enum defines the text users may type on the command line, while
 * this one is internal.
 */
enum CTERestLocation { none, first, last }

/** Performs the main work of the tsv-select program.
 *
 * Each input line is split on the delimiter, the listed fields are extracted, and the
 * line is written out with the fields in the requested order. Errors are reported by
 * throwing an exception.
 *
 * The --rest behavior is a template parameter rather than a run-time test, so the
 * inner loop does not repeat the same if-tests for every field. main instantiates the
 * function once per --rest value. The program is larger as a result, but faster:
 * run-time improvements of 25% were measured against a non-templatized version.
 * ('cte' stands for 'compile time evaluation'.)
 */
void tsvSelect(CTERestLocation cteRest)(in TsvSelectOptions cmdopt, in string[] inputFiles)
{
    import tsv_utils.common.utils: BufferedOutputRange, bufferedByLine, InputFieldReordering, throwIfWindowsNewlineOnUnix;
    import std.algorithm: splitter;
    import std.format: format;
    import std.range;

    // Sanity check: the instantiation must agree with the run-time --rest value.
    static if (cteRest == CTERestLocation.none)
    {
        assert(cmdopt.rest == TsvSelectOptions.RestOptionVal.none);
    }
    else static if (cteRest == CTERestLocation.first)
    {
        assert(cmdopt.rest == TsvSelectOptions.RestOptionVal.first);
    }
    else static if (cteRest == CTERestLocation.last)
    {
        assert(cmdopt.rest == TsvSelectOptions.RestOptionVal.last);
    }
    else
    {
        static assert (false, "Unexpected cteRest value.");
    }

    /* InputFieldReordering copies the selected fields of an input line into a new
     * buffer, placing them in output order as it goes.
     */
    auto reorderer = new InputFieldReordering!char(cmdopt.fields);

    /* Fields missing from the --fields list are gathered separately so they can be
     * written as a group (the --rest option). An 'Appender' is used because it is
     * faster than the ~= operator. It is cleared and reused for every line.
     */
    static if (cteRest != CTERestLocation.none)
    {
        auto restFields = appender!(char[][]);
    }

    /* BufferedOutputRange (from the tsv-utils common code) performs better than
     * writing to stdout directly.
     */
    auto output = BufferedOutputRange!(typeof(stdout))(stdout);

    /* With no files given, read standard input. A file name of "-" also means
     * standard input, as is customary for Unix command line tools.
     */
    auto filenames = (inputFiles.length > 0) ? inputFiles : ["-"];

    foreach (fileNum, filename; filenames)
    {
        auto inputStream = (filename == "-") ? stdin : File(filename);

        foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1))
        {
            if (lineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, lineNum);

                // Only the first file's header line is kept.
                if (fileNum > 0 && cmdopt.hasHeader) continue;
            }

            static if (cteRest != CTERestLocation.none)
            {
                restFields.clear;
            }

            reorderer.initNewLine;

            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                static if (cteRest == CTERestLocation.none)
                {
                    // Nothing else is needed from the line once every field is filled.
                    reorderer.processNextField(fieldIndex, fieldValue);
                    if (reorderer.allFieldsFilled) break;
                }
                else
                {
                    // A field that matched no --fields entry belongs to the 'rest' group.
                    immutable numMatched = reorderer.processNextField(fieldIndex, fieldValue);
                    if (numMatched == 0) restFields.put(fieldValue);
                }
            }

            // The whole line has been consumed. Every requested field must be present.
            if (!reorderer.allFieldsFilled)
            {
                const sourceName = (filename == "-") ? "Standard Input" : filename;
                throw new Exception(
                    format("Not enough fields in line. File: %s, Line: %s",
                           sourceName, lineNum));
            }

            // Emit the line: optional leading rest group, selected fields, optional
            // trailing rest group.
            static if (cteRest == CTERestLocation.first)
            {
                if (restFields.data.length > 0)
                {
                    output.joinAppend(restFields.data, cmdopt.delim);
                    output.append(cmdopt.delim);
                }
            }

            output.joinAppend(reorderer.outputFields, cmdopt.delim);

            static if (cteRest == CTERestLocation.last)
            {
                if (restFields.data.length > 0)
                {
                    output.append(cmdopt.delim);
                    output.joinAppend(restFields.data, cmdopt.delim);
                }
            }

            output.appendln;
        }
    }
}