/**
A variant of the unix 'cut' program, with the ability to reorder fields.

tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
fields. Lines are read from files or standard input and split on a delimiter character.
Fields are written to standard output in the order listed. Fields can be listed more
than once, and fields not listed can be written out as a group.

This program is intended both as a useful utility and a D programming language example.
Functionality and constructs used include command line argument processing, file I/O,
exception handling, ranges, tuples and strings, templates, universal function call syntax
(UFCS), lambdas and functional programming constructs. Comments are more verbose than
typical to shed light on D programming constructs, but not to the level of a tutorial.

Copyright (c) 2015-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.

// Imports used by multiple routines. Other imports are made in local context.
import std.exception : enforce;
import std.range;
import std.stdio;
import std.typecons : tuple, Tuple;

// 'Heredoc' style help text. When printed it is followed by a getopt formatted option list.
immutable helpText = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Ranges
can be used, and wildcards can be used when specifying fields by name.

Fields can be dropped using '--e|exclude'. Fields not included in the
'--f|fields' option can be selected as a group using '--r|rest'.

Examples:

   # Selecting fields. Output is in the order listed
   tsv-select -H -f date,time file.tsv
   tsv-select -f 2,1 file.tsv
   tsv-select -f 5-7,2,9-11
   tsv-select -H -f '*_date' file.tsv

   # Dropping fields
   tsv-select --exclude 1 file.tsv
   tsv-select -H -e date,time file.tsv

   # Move fields to the front or the back
   tsv-select -f 1 --rest first file.tsv # Move field 1 to the end
   tsv-select -H -f date --rest last # Move 'date' field to the front

   # Read multiple files, keep the header from only the first
   tsv-select data*.tsv -H --fields 1,2,4-7,14

Use '--help-verbose' for detailed information. Use '--help-fields' for
details about field lists and field names.

Options:
EOS";

// Extended help text printed by '--help-verbose'. Also followed by the getopt option list.
immutable helpTextVerbose = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Fields
can be repeated and ranges can be used. Wildcards can be used when
specifying fields by name, and escapes can be used to specify field names
containing special characters. Run '--help-fields' for details.

Fields can be excluded using '--e|exclude'. All fields not excluded are
output. Fields not included in the '--f|fields' option can be selected as
a group using '--r|rest'. '--f|fields' and '--r|rest' can be used with
'--e|exclude' to reorder non-excluded fields.

Examples:

   # Keep the first field from two files
   tsv-select -f 1 file1.tsv file2.tsv

   # Keep fields 1 and 2, retaining the header from only the first file
   tsv-select -H -f 1,2 file1.tsv file2.tsv

   # Keep the 'time' field
   tsv-select -H -f time file1.tsv

   # Keep all fields ending '_date' or '_time'
   tsv-select -H -f '*_date,*_time' file.tsv

   # Drop all the '*_time' fields
   tsv-select -H --exclude '*_time' file.tsv

   # Field reordering and field ranges
   tsv-select -f 3,2,1 file.tsv
   tsv-select -f 1,4-7,11 file.tsv
   tsv-select -f 1,7-4,11 file.tsv

   # Repeating fields
   tsv-select -f 1,2,1 file.tsv
   tsv-select -f 1-3,3-1 file.tsv

   # Move fields to the front
   tsv-select -f 5 --rest last file.tsv
   tsv-select -H -f Date,Time --rest last file.tsv

   # Move fields to the end
   tsv-select -f 4,5 --rest first file.tsv
   tsv-select -H -f '*_time' --rest first file.tsv

   # Move field 2 to the front and drop fields 10-15
   tsv-select -f 2 -e 10-15 file.tsv

   # Move field 2 to the end, dropping fields 10-15
   tsv-select -f 2 --rest first -e 10-15 file.tsv

Use '--help-fields' for detailed help on field lists.

Notes:
* One of '--f|fields' or '--e|exclude' is required.
* Fields specified by '--f|fields' and '--e|exclude' cannot overlap.
* When '--f|fields' and '--e|exclude' are used together, the effect is to
  specify '--rest last'. This can be overridden by using '--rest first'.
* Each input line must be long enough to contain all fields specified
  with '--f|fields'. This is not necessary for '--e|exclude' fields.
* Specifying names of fields containing special characters may require
  escaping the special characters. See '--help-fields' for details.

Options:
EOS";

/** Container for command line options.
 *
 * Holds both the values set directly by getopt and the values derived from them
 * (the zero-based `fields` list and the `excludedFieldsTable` lookup table).
 */
struct TsvSelectOptions
{
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    // The allowed values for the --rest option.
    enum RestOption { none, first, last};

    string programName;                 /// Program name
    ByLineSourceRange!() inputSources;  /// Input Files
    bool hasHeader = false;             /// --H|header
    char delim = '\t';                  /// --d|delimiter
    RestOption restArg;                 /// --rest first|last (none is hidden default)
    size_t[] fields;                    /// Derived from --f|fields
    bool[] excludedFieldsTable;         /// Derived. Lookup table for excluded fields.

    /** Process command line arguments (getopt cover).
     *
     * processArgs calls getopt to process command line arguments. It does any additional
     * validation and parameter derivations needed. A tuple is returned. First value is
     * true if command line arguments were successfully processed and execution should
     * continue, or false if an error occurred or the user asked for help. If false, the
     * second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Params:
     *   cmdArgs = The program's command line. Options are consumed by getopt and the
     *             array is truncated to length 1 once the file paths have been taken.
     *
     * Returns: tuple(true, 0) to continue; tuple(false, 0) after printing help or
     *          version text; tuple(false, 1) after an error has been written to stderr.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : maxElement, setIntersection, sort;
        import std.array : split;
        import std.conv : to;
        import std.format : format;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;
        import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

        bool helpVerbose = false;           // --help-verbose
        bool helpFields = false;            // --help-fields
        bool versionWanted = false;         // --V|version
        string fieldsArg;                   // --f|fields
        string excludedFieldsArg;           // --e|exclude

        // Kept in variables because the same text is passed to both getopt and
        // parseFieldList.
        string fieldsOptionString = "f|fields";
        string excludedFieldsOptionString = "e|exclude";

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",
                " Print more detailed help.",
                &helpVerbose,

                "help-fields",
                " Print help on specifying fields.",
                &helpFields,

                // Case sensitivity is switched on only around the upper-case short
                // options ('-H', '-V') and switched back off afterwards.
                std.getopt.config.caseSensitive,
                "H|header",
                " Treat the first line of each file as a header.",
                &hasHeader,
                std.getopt.config.caseInsensitive,

                fieldsOptionString,
                "<field-list> Fields to retain. Fields are output in the order listed.",
                &fieldsArg,

                excludedFieldsOptionString,
                "<field-list> Fields to exclude.",
                &excludedFieldsArg,

                "r|rest",
                "first|last Output location for fields not included in '--f|fields'.",
                &restArg,

                "d|delimiter",
                "CHR Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)",
                &delim,

                std.getopt.config.caseSensitive,
                "V|version",
                " Print version information and exit.",
                &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /* Remaining command line args are files. Use standard input if files
             * were not provided. Truncate cmdArgs to consume the arguments.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             *
             * Note: fields and excludedFields depend on header line processing, but
             * fieldsArg and excludedFieldsArg can be used to detect whether the
             * command line argument was specified.
             */

            enforce(!fieldsArg.empty || !excludedFieldsArg.empty,
                    "One of '--f|fields' or '--e|exclude' is required.");

            string[] headerFields;

            /* fieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             */
            void fieldListArgProcessing()
            {
                if (!fieldsArg.empty)
                {
                    fields = fieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, fieldsOptionString)
                        .array;
                }

                size_t[] excludedFields;

                if (!excludedFieldsArg.empty)
                {
                    excludedFields = excludedFieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, excludedFieldsOptionString)
                        .array;
                }

                if (excludedFields.length > 0)
                {
                    /* Make sure selected and excluded fields do not overlap. The check
                     * intersects sorted copies of the two lists rather than comparing
                     * every pair, so large ranges do not cause quadratic work. Copies
                     * are sorted because the order of 'fields' is the output order.
                     */
                    enforce(setIntersection(sort(fields.dup), sort(excludedFields.dup)).empty,
                            "'--f|fields' and '--e|exclude' have overlapping fields.");

                    /* '--exclude' changes '--rest' default to 'last'. */
                    if (restArg == RestOption.none) restArg = RestOption.last;

                    /* Build the excluded field lookup table.
                     *
                     * Note: Users won't have any reason to expect memory is allocated based
                     * on the max field number. However, users might pick arbitrarily large
                     * numbers when trimming fields. So, limit the max field number to something
                     * big but reasonable (more than 1 million). The limit can be raised if use
                     * cases arise.
                     */
                    size_t maxExcludedField = excludedFields.maxElement;
                    size_t maxAllowedExcludedField = 1024 * 1024;

                    // maxExcludedField is a zero-based index, so '<' admits one-based
                    // field numbers up to and including maxAllowedExcludedField.
                    enforce(maxExcludedField < maxAllowedExcludedField,
                            format("Maximum allowed '--e|exclude' field number is %d.",
                                   maxAllowedExcludedField));

                    excludedFieldsTable.length = maxExcludedField + 1;          // Initialized to false
                    foreach (e; excludedFields) excludedFieldsTable[e] = true;
                }
            }

            if (!hasHeader) fieldListArgProcessing();

            /*
             * Create the byLineSourceRange and perform header line processing.
             */
            inputSources = byLineSourceRange(filepaths);

            if (hasHeader)
            {
                // An empty first input leaves headerFields empty; field list parsing
                // still runs so that the field arguments get validated.
                if (!inputSources.front.byLine.empty)
                {
                    throwIfWindowsNewlineOnUnix(inputSources.front.byLine.front, inputSources.front.name, 1);
                    headerFields = inputSources.front.byLine.front.split(delim).to!(string[]);
                }

                fieldListArgProcessing();
            }

        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/* Default D runtime options, compiled in for compiler front-end versions 2.085 and later.
 * NOTE(review): 'gcopt=cleanup:none' presumably disables the garbage collector's cleanup
 * pass at program exit - confirm against the D runtime GC configuration documentation.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Main program.
 *
 * Processes the command line, then runs the tsvSelect instantiation matching the
 * --rest option.
 *
 * Returns: 0 on success or after printing help/version text, 1 on error.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];
    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }
    try
    {
        /* Invoke the tsvSelect template matching the --rest option chosen. The input
         * files (or standard input) were already opened by processArgs and are carried
         * in cmdopt.inputSources, so cmdopt is the only argument needed.
         */
        final switch (cmdopt.restArg)
        {
        case TsvSelectOptions.RestOption.none:
            tsvSelect!(RestLocation.none)(cmdopt);
            break;
        case TsvSelectOptions.RestOption.first:
            tsvSelect!(RestLocation.first)(cmdopt);
            break;
        case TsvSelectOptions.RestOption.last:
            tsvSelect!(RestLocation.last)(cmdopt);
            break;
        }
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }

    return 0;
}

// tsvSelect

/** Enumeration of the different specializations of the tsvSelect template.
420 * 421 * RestLocation is logically equivalent to the TsvSelectOptions.RestOption enum. It 422 * is used by main to choose the appropriate tsvSelect template instantiation to call. It 423 * is distinct from the TsvSelectOptions enum to separate it from the end-user UI. The 424 * TsvSelectOptions version specifies the text of allowed values in command line arguments. 425 */ 426 enum RestLocation { none, first, last }; 427 428 /** tsvSelect does the primary work of the tsv-select program. 429 * 430 * Input is read line by line, extracting the listed fields and writing them out in the order 431 * specified. An exception is thrown on error. 432 * 433 * This function is templatized with instantiations for the different --rest options. This 434 * avoids repeatedly running the same if-tests inside the inner loop. The main function 435 * instantiates this function three times, once for each of the --rest options. It results 436 * in a larger program, but is faster. Run-time improvements of 25% were measured compared 437 * to the non-templatized version. (Note: 'cte' stands for 'compile time evaluation'.) 438 */ 439 440 void tsvSelect(RestLocation rest)(ref TsvSelectOptions cmdopt) 441 { 442 import tsv_utils.common.utils: BufferedOutputRange, ByLineSourceRange, 443 InputFieldReordering, throwIfWindowsNewlineOnUnix; 444 import std.algorithm: splitter; 445 import std.array : appender, Appender; 446 import std.format: format; 447 import std.range; 448 449 // Ensure the correct template instantiation was called. 
450 static if (rest == RestLocation.none) 451 assert(cmdopt.restArg == TsvSelectOptions.RestOption.none); 452 else static if (rest == RestLocation.first) 453 assert(cmdopt.restArg == TsvSelectOptions.RestOption.first); 454 else static if (rest == RestLocation.last) 455 assert(cmdopt.restArg == TsvSelectOptions.RestOption.last); 456 else 457 static assert(false, "rest template argument does not match cmdopt.restArg."); 458 459 /* Check that the input files were setup as expected. Should at least have one 460 * input, stdin if nothing else, and newlines removed from the byLine range. 461 */ 462 assert(!cmdopt.inputSources.empty); 463 static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator))); 464 465 /* The algorithm here assumes RestOption.none is not used with --exclude-fields. */ 466 assert(cmdopt.excludedFieldsTable.length == 0 || rest != RestLocation.none); 467 468 /* InputFieldReordering copies select fields from an input line to a new buffer. 469 * The buffer is reordered in the process. 470 */ 471 auto fieldReordering = new InputFieldReordering!char(cmdopt.fields); 472 473 /* Fields not on the --fields list are added to a separate buffer so they can be 474 * output as a group (the --rest option). This is done using an 'Appender', which 475 * is faster than the ~= operator. The Appender is passed a GC allocated buffer 476 * that grows as needed and is reused for each line. Typically it'll grow only 477 * on the first line. 478 */ 479 static if (rest != RestLocation.none) 480 { 481 auto leftOverFieldsAppender = appender!(char[][]); 482 } 483 484 /* BufferedOutputRange (from common/utils.d) is a performance improvement over 485 * writing directly to stdout. 486 */ 487 auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); 488 489 /* Read each input file (or stdin) and iterate over each line. 
490 */ 491 foreach (fileNum, inputStream; cmdopt.inputSources.enumerate) 492 { 493 foreach (lineNum, line; inputStream.byLine.enumerate(1)) 494 { 495 if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum); 496 497 if (lineNum == 1 && fileNum > 0 && cmdopt.hasHeader) 498 { 499 continue; // Drop the header line from all but the first file. 500 } 501 502 static if (rest != RestLocation.none) 503 { 504 leftOverFieldsAppender.clear; 505 506 /* Track the field location in the line. This enables bulk appending 507 * after the last specified field has been processed. 508 */ 509 size_t nextFieldStart = 0; 510 } 511 512 fieldReordering.initNewLine; 513 514 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 515 { 516 static if (rest == RestLocation.none) 517 { 518 fieldReordering.processNextField(fieldIndex, fieldValue); 519 if (fieldReordering.allFieldsFilled) break; 520 } 521 else 522 { 523 /* Processing with 'rest' fields. States: 524 * - Excluded fields and specified fields remain 525 * - Only specified fields remain 526 * - Only excluded fields remain 527 */ 528 529 nextFieldStart += fieldValue.length + 1; 530 bool excludedFieldsRemain = fieldIndex < cmdopt.excludedFieldsTable.length; 531 immutable isExcluded = excludedFieldsRemain && cmdopt.excludedFieldsTable[fieldIndex]; 532 533 if (!isExcluded) 534 { 535 immutable numMatched = fieldReordering.processNextField(fieldIndex, fieldValue); 536 537 if (numMatched == 0) leftOverFieldsAppender.put(fieldValue); 538 } 539 else if (fieldIndex + 1 == cmdopt.excludedFieldsTable.length) 540 { 541 excludedFieldsRemain = false; 542 } 543 544 if (fieldReordering.allFieldsFilled && !excludedFieldsRemain) 545 { 546 /* Processed all specified fields. Bulk append any fields 547 * remaining on the line. Cases: 548 * - Current field is last field: 549 */ 550 if (nextFieldStart <= line.length) 551 { 552 leftOverFieldsAppender.put(line[nextFieldStart .. 
$]); 553 } 554 555 break; 556 } 557 } 558 } 559 560 // Finished with all fields in the line. 561 enforce(fieldReordering.allFieldsFilled, 562 format("Not enough fields in line. File: %s, Line: %s", 563 inputStream.name, lineNum)); 564 565 // Write the re-ordered line. 566 567 static if (rest == RestLocation.first) 568 { 569 if (leftOverFieldsAppender.data.length > 0) 570 { 571 bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim); 572 if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim); 573 } 574 } 575 576 bufferedOutput.joinAppend(fieldReordering.outputFields, cmdopt.delim); 577 578 static if (rest == RestLocation.last) 579 { 580 if (leftOverFieldsAppender.data.length > 0) 581 { 582 if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim); 583 bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim); 584 } 585 } 586 587 bufferedOutput.appendln; 588 589 /* Send the first line of the first file immediately. This helps detect 590 * errors quickly in multi-stage unix pipelines. Note that tsv-select may 591 * have been sent one line from an upstream process, usually a header line. 592 */ 593 if (lineNum == 1 && fileNum == 0) bufferedOutput.flush; 594 } 595 } 596 }