/**
A variant of the unix 'cut' program, with the ability to reorder fields.

tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
fields. Lines are read from files or standard input and split on a delimiter character.
Fields are written to standard output in the order listed. Fields can be listed more
than once, and fields not listed can be written out as a group.

This program is intended both as a useful utility and a D programming language example.
Functionality and constructs used include command line argument processing, file I/O,
exception handling, ranges, tuples and strings, templates, universal function call syntax
(UFCS), lambdas and functional programming constructs. Comments are more verbose than
typical to shed light on D programming constructs, but not to the level of a tutorial.

Copyright (c) 2015-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.

// Imports used by multiple routines. Other imports are made in local context.
import std.stdio;
import std.typecons : tuple, Tuple;

// 'Heredoc' style help text. When printed it is followed by a getopt formatted option list.
// NOTE(review): indentation of the example and note lines inside the two help texts was
// reconstructed during reformatting; confirm against the intended layout.
immutable helpText = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields numbers start with one. They are comma separated and ranges can be
used. Fields can be repeated, and fields not included in the '--f|fields'
option can be selected as a group using '--r|rest'. Fields can be dropped
using '--e|exclude'. Multiple files with header lines can be managed with
'--H|header', which retains the header of the first file only.

Examples:

   # Output fields 2 and 1, in that order
   tsv-select -f 2,1 data.tsv

   # Drop the first field, keep everything else.
   tsv-select --exclude 1 file.tsv

   # Move the first field to the end
   tsv-select -f 1 --rest first data.tsv

   # Multiple files with header lines. Keep only one header.
   tsv-select data*.tsv -H --fields 1,2,4-7,14

Use '--help-verbose' for detailed information.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields numbers start with one. They are comma separated and ranges can be
used. Fields can be repeated, and fields not included in the '--f|fields'
option can be selected as a group using '--r|rest'. Use '--H|header' to
retain the header line from only the first file.

Fields can be excluded using '--e|exclude'. All fields not excluded are
output. '--f|fields' and '--r|rest' can be used with '--e|exclude' to
reorder non-excluded fields.

Examples:

   # Keep the first field from two files
   tsv-select -f 1 file1.tsv file2.tsv

   # Keep fields 1 and 2, retain the header from the first file
   tsv-select -H -f 1,2 file1.tsv file2.tsv

   # Field reordering and field ranges
   tsv-select -f 3,2,1 file.tsv
   tsv-select -f 1,4-7,11 file.tsv
   tsv-select -f 1,7-4,11 file.tsv

   # Repeating fields
   tsv-select -f 1,2,1 file.tsv
   tsv-select -f 1-3,3-1 file.tsv

   # Move field 5 to the front
   tsv-select -f 5 --rest last file.tsv

   # Move fields 4 and 5 to the end
   tsv-select -f 4,5 --rest first file.tsv

   # Drop the first field, keep everything else
   tsv-select --exclude 1 file.tsv

   # Move field 2 to the front and drop fields 10-15
   tsv-select -f 2 -e 10-15 file.tsv

   # Move field 2 to the end, dropping fields 10-15
   tsv-select -f 2 --rest first -e 10-15 file.tsv

Notes:
* One of '--f|fields' or '--e|exclude' is required.
* Fields specified by '--f|fields' and '--e|exclude' cannot overlap.
* When '--f|fields' and '--e|exclude' are used together, the effect is to
  specify '--rest last'. This can be overridden by using '--rest first'.
* Each input line must be long enough to contain all fields specified with
  '--f|fields'. This is not necessary for '--e|exclude' fields.

Options:
EOS";

/** Container for command line options.
 *
 * The option fields are filled in by processArgs. Field lists are stored as zero-based
 * indices once processArgs has returned successfully.
 */
struct TsvSelectOptions
{
    /// The allowed values for the --rest option.
    enum RestOption { none, first, last }

    string programName;                 /// Program name
    bool helpVerbose = false;           /// --help-verbose
    bool hasHeader = false;             /// --H|header
    char delim = '\t';                  /// --d|delimiter
    size_t[] fields;                    /// --f|fields
    size_t[] excludedFieldsArg;         /// --e|exclude
    RestOption restArg;                 /// --rest first|last (none is hidden default)
    bool versionWanted = false;         /// --V|version
    bool[] excludedFieldsTable;         /// Derived. Lookup table for excluded fields.

    /** Process command line arguments (getopt cover).
     *
     * processArgs calls getopt to process command line arguments. It does any additional
     * validation and parameter derivations needed. A tuple is returned. First value is
     * true if command line arguments were successfully processed and execution should
     * continue, or false if an error occurred or the user asked for help. If false, the
     * second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Params:
     *   cmdArgs = The program's command line. Passed by reference to getopt, which
     *             processes the option arguments.
     *
     * Returns: tuple(continueExecution, exitCode).
     *
     * Exceptions raised while processing or validating the arguments are caught here,
     * reported on stderr and converted to the (false, 1) result.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : canFind, maxElement;
        import std.format : format;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options

            /* Case sensitivity is switched on only around -H and -V so that these
             * upper-case short options are matched exactly.
             *
             * NOTE(review): the description strings below appear to have had their
             * column padding collapsed to a single space; confirm against the intended
             * help layout.
             */
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "f|fields", "<field-list> Fields to retain. Fields are output in the order listed.",
                fields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "e|exclude", "<field-list> Fields to exclude.",
                excludedFieldsArg.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "r|rest", "first|last Output location for fields not included in '--f|fields'.", &restArg,
                "d|delimiter", "CHR Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /*
             * Consistency checks and derivations.
             */

            if (fields.length == 0 && excludedFieldsArg.length == 0)
            {
                throw new Exception("One of '--f|fields' or '--e|exclude' is required.");
            }

            if (excludedFieldsArg.length > 0)
            {
                /* Make sure selected and excluded fields do not overlap. */
                foreach (e; excludedFieldsArg)
                {
                    if (fields.canFind(e))
                    {
                        throw new Exception("'--f|fields' and '--e|exclude' have overlapping fields.");
                    }
                }

                /* '--exclude' changes '--rest' default to 'last'. */
                if (restArg == RestOption.none) restArg = RestOption.last;

                /* Build the excluded field lookup table.
                 *
                 * Note: Users won't have any reason to expect memory is allocated based
                 * on the max field number. However, users might pick arbitrarily large
                 * numbers when trimming fields. So, limit the max field number to something
                 * big but reasonable (more than 1 million). The limit can be raised if use
                 * cases arise.
                 */
                enum size_t maxAllowedExcludedField = 1024 * 1024;
                immutable size_t maxExcludedField = excludedFieldsArg.maxElement;

                if (maxExcludedField >= maxAllowedExcludedField)
                {
                    throw new Exception(format("Maximum allowed '--e|exclude' field number is %d.",
                                               maxAllowedExcludedField));
                }

                /* The table is indexed by zero-based field index, so its last entry is
                 * always an excluded field.
                 */
                excludedFieldsTable.length = maxExcludedField + 1;    // Initialized to false
                foreach (e; excludedFieldsArg) excludedFieldsTable[e] = true;
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/* D runtime option string: sets the garbage collector's 'cleanup' option to 'none'.
 * Only declared for compiler front-end versions 2.085 and later.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Main program.
 *
 * Processes the command line, then runs the tsvSelect instantiation matching the
 * --rest option. Returns 0 on success and 1 if an exception was raised while
 * processing input. When processArgs signals that execution should not continue,
 * the exit code it supplies is returned.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    try
    {
        /* Invoke the tsvSelect template matching the --rest option chosen. Option args
         * are removed by command line processing (getopt). The program name and any files
         * remain. Pass the files to tsvSelect.
         */
        final switch (cmdopt.restArg)
        {
        case TsvSelectOptions.RestOption.none:
            tsvSelect!(RestLocation.none)(cmdopt, cmdArgs[1..$]);
            break;
        case TsvSelectOptions.RestOption.first:
            tsvSelect!(RestLocation.first)(cmdopt, cmdArgs[1..$]);
            break;
        case TsvSelectOptions.RestOption.last:
            tsvSelect!(RestLocation.last)(cmdopt, cmdArgs[1..$]);
            break;
        }
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }

    return 0;
}

// tsvSelect

/** Enumeration of the different specializations of the tsvSelect template.
 *
 * RestLocation is logically equivalent to the TsvSelectOptions.RestOption enum. It
 * is used by main to choose the appropriate tsvSelect template instantiation to call. It
 * is distinct from the TsvSelectOptions enum to separate it from the end-user UI. The
 * TsvSelectOptions version specifies the text of allowed values in command line arguments.
 */
enum RestLocation { none, first, last }

/** tsvSelect does the primary work of the tsv-select program.
 *
 * Input is read line by line, extracting the listed fields and writing them out in the order
 * specified. An exception is thrown on error.
 *
 * This function is templatized with instantiations for the different --rest options. This
 * avoids repeatedly running the same if-tests inside the inner loop. The main function
 * instantiates this function three times, once for each of the --rest options. It results
 * in a larger program, but is faster. Run-time improvements of 25% were measured compared
 * to the non-templatized version.
 *
 * Params:
 *   rest       = Compile-time selector for where non-listed fields are written. Must
 *                match cmdopt.restArg.
 *   cmdopt     = Validated command line options, with zero-based field indices.
 *   inputFiles = Files to read. An empty list, or a name of "-", means standard input.
 *
 * Throws: Exception if a line does not contain all fields listed in cmdopt.fields.
 */
void tsvSelect(RestLocation rest)(const TsvSelectOptions cmdopt, const string[] inputFiles)
{
    import tsv_utils.common.utils: BufferedOutputRange, bufferedByLine, InputFieldReordering, throwIfWindowsNewlineOnUnix;
    import std.algorithm: splitter;
    import std.format: format;
    import std.range;

    // Ensure the correct template instantiation was called.
    static if (rest == RestLocation.none)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.none);
    else static if (rest == RestLocation.first)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.first);
    else static if (rest == RestLocation.last)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.last);
    else
        static assert(false, "rest template argument does not match cmdopt.restArg.");

    /* The algorithm here assumes RestOption.none is not used with --exclude. */
    assert(cmdopt.excludedFieldsTable.length == 0 || rest != RestLocation.none);

    /* InputFieldReordering copies select fields from an input line to a new buffer.
     * The buffer is reordered in the process.
     */
    auto fieldReordering = new InputFieldReordering!char(cmdopt.fields);

    /* Fields not on the --fields list are added to a separate buffer so they can be
     * output as a group (the --rest option). This is done using an 'Appender', which
     * is faster than the ~= operator. The Appender is passed a GC allocated buffer
     * that grows as needed and is reused for each line. Typically it'll grow only
     * on the first line.
     */
    static if (rest != RestLocation.none)
    {
        auto leftOverFieldsAppender = appender!(char[][]);
    }

    /* BufferedOutputRange (from tsvutils.d) is a performance improvement over writing
     * directly to stdout.
     */
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    /* Read each input file (or stdin) and iterate over each line. A filename of "-" is
     * interpreted as stdin, common behavior for unix command line tools.
     */
    foreach (fileNum, filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            if (lineNum == 1 && fileNum > 0 && cmdopt.hasHeader)
            {
                continue;   // Drop the header line from all but the first file.
            }

            static if (rest != RestLocation.none)
            {
                leftOverFieldsAppender.clear;

                /* Track the field location in the line. This enables bulk appending
                 * after the last specified field has been processed.
                 */
                size_t nextFieldStart = 0;
            }

            fieldReordering.initNewLine;

            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                static if (rest == RestLocation.none)
                {
                    fieldReordering.processNextField(fieldIndex, fieldValue);
                    if (fieldReordering.allFieldsFilled) break;
                }
                else
                {
                    /* Processing with 'rest' fields. States:
                     *  - Excluded fields and specified fields remain
                     *  - Only specified fields remain
                     *  - Only excluded fields remain
                     */

                    // Offset just past the (single char) delimiter following this field.
                    nextFieldStart += fieldValue.length + 1;
                    bool excludedFieldsRemain = fieldIndex < cmdopt.excludedFieldsTable.length;
                    immutable isExcluded = excludedFieldsRemain && cmdopt.excludedFieldsTable[fieldIndex];

                    if (!isExcluded)
                    {
                        immutable numMatched = fieldReordering.processNextField(fieldIndex, fieldValue);

                        // Not a listed field, so it belongs to the 'rest' group.
                        if (numMatched == 0) leftOverFieldsAppender.put(fieldValue);
                    }
                    else if (fieldIndex + 1 == cmdopt.excludedFieldsTable.length)
                    {
                        // This was the highest numbered excluded field.
                        excludedFieldsRemain = false;
                    }

                    if (fieldReordering.allFieldsFilled && !excludedFieldsRemain)
                    {
                        /* Processed all specified and excluded fields. Bulk append
                         * whatever remains on the line as a single slice (the slice
                         * still contains its delimiters). Cases:
                         *  - nextFieldStart > line.length: the current field is the
                         *    last field, nothing remains.
                         *  - nextFieldStart == line.length: the line ends with a
                         *    delimiter, one empty field remains.
                         *  - nextFieldStart < line.length: one or more fields remain.
                         */
                        if (nextFieldStart <= line.length)
                        {
                            leftOverFieldsAppender.put(line[nextFieldStart .. $]);
                        }

                        break;
                    }
                }
            }

            // Finished with all fields in the line.
            if (!fieldReordering.allFieldsFilled)
            {
                throw new Exception(
                    format("Not enough fields in line. File: %s, Line: %s",
                           (filename == "-") ? "Standard Input" : filename, lineNum));
            }

            // Write the re-ordered line.

            static if (rest == RestLocation.first)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                }
            }

            bufferedOutput.joinAppend(fieldReordering.outputFields, cmdopt.delim);

            static if (rest == RestLocation.last)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                }
            }

            bufferedOutput.appendln;
        }
    }
}