/**
A variant of the unix 'cut' program, with the ability to reorder fields.

tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
fields. Lines are read from files or standard input and split on a delimiter character.
Fields are written to standard output in the order listed. Fields can be listed more
than once, and fields not listed can be written out as a group.

This program is intended both as a useful utility and a D programming language example.
Functionality and constructs used include command line argument processing, file I/O,
exception handling, ranges, tuples and strings, templates, universal function call syntax
(UFCS), lambdas and functional programming constructs. Comments are more verbose than
typical to shed light on D programming constructs, but not to the level of a tutorial.

Copyright (c) 2015-2021, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.

// Imports used by multiple routines. Other imports are made in local context.
import std.exception : enforce;
import std.range;
import std.stdio;
import std.typecons : tuple, Tuple;

// 'Heredoc' style help text. When printed it is followed by a getopt formatted option list.
immutable helpText = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Ranges
can be used, and wildcards can be used when specifying fields by name.

Fields can be dropped using '--e|exclude'. Fields not included in the
'--f|fields' option can be selected as a group using '--r|rest'.

Examples:

   # Selecting fields. Output is in the order listed
   tsv-select -H -f date,time file.tsv
   tsv-select -f 2,1 file.tsv
   tsv-select -f 5-7,2,9-11
   tsv-select -H -f '*_date' file.tsv

   # Dropping fields
   tsv-select --exclude 1 file.tsv
   tsv-select -H -e date,time file.tsv

   # Move fields to the front or the back
   tsv-select -f 1 --rest first file.tsv # Move field 1 to the end
   tsv-select -H -f date --rest last # Move 'date' field to the front

   # Read multiple files, keep the header from only the first
   tsv-select data*.tsv -H --fields 1,2,4-7,14

Use '--help-verbose' for detailed information. Use '--help-fields' for
details about field lists and field names.

Options:
EOS";

// Extended help text printed for '--help-verbose'. Also followed by the getopt option list.
immutable helpTextVerbose = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields can be specified by field number or, for files with header lines,
by field name. Use '--H|header' to enable selection by name. This also
manages header lines from multiple files, retaining only the first header.

Field numbers start with one. The field list is comma separated. Fields
can be repeated and ranges can be used. Wildcards can be used when
specifying fields by name, and escapes can be used to specify field names
containing special characters. Run '--help-fields' for details.

Fields can be excluded using '--e|exclude'. All fields not excluded are
output. Fields not included in the '--f|fields' option can be selected as
a group using '--r|rest'. '--f|fields' and '--r|rest' can be used with
'--e|exclude' to reorder non-excluded fields.

Examples:

   # Keep the first field from two files
   tsv-select -f 1 file1.tsv file2.tsv

   # Keep fields 1 and 2, retaining the header from only the first file
   tsv-select -H -f 1,2 file1.tsv file2.tsv

   # Keep the 'time' field
   tsv-select -H -f time file1.tsv

   # Keep all fields ending '_date' or '_time'
   tsv-select -H -f '*_date,*_time' file.tsv

   # Drop all the '*_time' fields
   tsv-select -H --exclude '*_time' file.tsv

   # Field reordering and field ranges
   tsv-select -f 3,2,1 file.tsv
   tsv-select -f 1,4-7,11 file.tsv
   tsv-select -f 1,7-4,11 file.tsv

   # Repeating fields
   tsv-select -f 1,2,1 file.tsv
   tsv-select -f 1-3,3-1 file.tsv

   # Move fields to the front
   tsv-select -f 5 --rest last file.tsv
   tsv-select -H -f Date,Time --rest last file.tsv

   # Move fields to the end
   tsv-select -f 4,5 --rest first file.tsv
   tsv-select -H -f '*_time' --rest first file.tsv

   # Move field 2 to the front and drop fields 10-15
   tsv-select -f 2 -e 10-15 file.tsv

   # Move field 2 to the end, dropping fields 10-15
   tsv-select -f 2 --rest first -e 10-15 file.tsv

Use '--help-fields' for detailed help on field lists.

Notes:
* One of '--f|fields' or '--e|exclude' is required.
* Fields specified by '--f|fields' and '--e|exclude' cannot overlap.
* When '--f|fields' and '--e|exclude' are used together, the effect is to
  specify '--rest last'. This can be overridden by using '--rest first'.
* Each input line must be long enough to contain all fields specified
  with '--f|fields'. This is not necessary for '--e|exclude' fields.
* Specifying names of fields containing special characters may require
  escaping the special characters. See '--help-fields' for details.
* Output is buffered by default to improve performance. Use
  '--line-buffered' to have each line immediately written out.

Options:
EOS";

/** Container for command line options.
 */
struct TsvSelectOptions
{
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange, LineBuffered,
        ReadHeader;

    // The allowed values for the --rest option.
    enum RestOption { none, first, last }

    string programName;                 /// Program name
    ByLineSourceRange!() inputSources;  /// Input Files
    bool hasHeader = false;             /// --H|header
    char delim = '\t';                  /// --d|delimiter
    bool lineBuffered = false;          /// --line-buffered
    RestOption restArg;                 /// --rest first|last (none is hidden default)
    size_t[] fields;                    /// Derived from --f|fields
    bool[] excludedFieldsTable;         /// Derived. Lookup table for excluded fields.

    /** Process command line arguments (getopt cover).
     *
     * processArgs calls getopt to process command line arguments. It does any additional
     * validation and parameter derivations needed. A tuple is returned. First value is
     * true if command line arguments were successfully processed and execution should
     * continue, or false if an error occurred or the user asked for help. If false, the
     * second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first. On success it is
     *             truncated to the program name only; file arguments are moved into
     *             inputSources.
     *
     * Returns: tuple(true, 0) to continue; tuple(false, 0) after printing help or
     *   version text; tuple(false, 1) after reporting an error to stderr.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : maxElement;
        import std.array : split;
        import std.conv : to;
        import std.format : format;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;
        import tsv_utils.common.utils : throwIfWindowsNewline;

        bool helpVerbose = false;           // --help-verbose
        bool helpFields = false;            // --help-fields
        bool versionWanted = false;         // --V|version
        string fieldsArg;                   // --f|fields
        string excludedFieldsArg;           // --e|exclude

        // Option strings are kept in variables as they are also passed to
        // parseFieldList alongside the argument text.
        string fieldsOptionString = "f|fields";
        string excludedFieldsOptionString = "e|exclude";

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",
                " Print more detailed help.",
                &helpVerbose,

                "help-fields",
                " Print help on specifying fields.",
                &helpFields,

                // -H and -V are case sensitive; the other options are not.
                std.getopt.config.caseSensitive,
                "H|header",
                " Treat the first line of each file as a header.",
                &hasHeader,
                std.getopt.config.caseInsensitive,

                fieldsOptionString,
                "<field-list> Fields to retain. Fields are output in the order listed.",
                &fieldsArg,

                excludedFieldsOptionString,
                "<field-list> Fields to exclude.",
                &excludedFieldsArg,

                "r|rest",
                "first|last Output location for fields not included in '--f|fields'.",
                &restArg,

                "d|delimiter",
                "CHR Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)",
                &delim,

                "line-buffered",
                " Immediately output every line.",
                &lineBuffered,

                std.getopt.config.caseSensitive,
                "V|version",
                " Print version information and exit.",
                &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /* Remaining command line args are files. Use standard input if files
             * were not provided. Truncate cmdArgs to consume the arguments.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             *
             * Note: fields and excludedFields depend on header line processing, but
             * fieldsArg and excludedFieldsArg can be used to detect whether the
             * command line argument was specified.
             */

            enforce(!fieldsArg.empty || !excludedFieldsArg.empty,
                    "One of '--f|fields' or '--e|exclude' is required.");

            string[] headerFields;

            /* fieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             */
            void fieldListArgProcessing()
            {
                if (!fieldsArg.empty)
                {
                    fields = fieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, fieldsOptionString)
                        .array;
                }

                size_t[] excludedFields;

                if (!excludedFieldsArg.empty)
                {
                    excludedFields = excludedFieldsArg
                        .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(
                            hasHeader, headerFields, excludedFieldsOptionString)
                        .array;
                }

                if (excludedFields.length > 0)
                {
                    /* Make sure selected and excluded fields do not overlap. */
                    foreach (e; excludedFields)
                    {
                        foreach (f; fields)
                        {
                            enforce(e != f, "'--f|fields' and '--e|exclude' have overlapping fields.");
                        }
                    }

                    /* '--exclude' changes '--rest' default to 'last'. */
                    if (restArg == RestOption.none) restArg = RestOption.last;

                    /* Build the excluded field lookup table.
                     *
                     * Note: Users won't have any reason to expect memory is allocated based
                     * on the max field number. However, users might pick arbitrarily large
                     * numbers when trimming fields. So, limit the max field number to something
                     * big but reasonable (more than 1 million). The limit can be raised if use
                     * cases arise.
                     */
                    size_t maxExcludedField = excludedFields.maxElement;
                    size_t maxAllowedExcludedField = 1024 * 1024;

                    enforce(maxExcludedField < maxAllowedExcludedField,
                            format("Maximum allowed '--e|exclude' field number is %d.",
                                   maxAllowedExcludedField));

                    excludedFieldsTable.length = maxExcludedField + 1;   // Initialized to false
                    foreach (e; excludedFields) excludedFieldsTable[e] = true;
                }
            }

            if (!hasHeader) fieldListArgProcessing();

            /*
             * Create the byLineSourceRange and perform header line processing.
             */
            immutable LineBuffered isLineBuffered = lineBuffered ? Yes.lineBuffered : No.lineBuffered;
            immutable ReadHeader useReadHeader = hasHeader ? Yes.readHeader : No.readHeader;
            inputSources = byLineSourceRange(filepaths, isLineBuffered, useReadHeader);

            if (hasHeader)
            {
                /* headerFields stays empty if the first input has no lines. */
                if (!inputSources.front.byLine.empty)
                {
                    throwIfWindowsNewline(inputSources.front.byLine.front, inputSources.front.name, 1);
                    headerFields = inputSources.front.byLine.front.split(delim).to!(string[]);
                }

                fieldListArgProcessing();
            }

        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/* D runtime garbage collector option ("gcopt=cleanup:none"), set at compile time
 * for compilers at version 2.085 or later.
 * NOTE(review): per the druntime GC configuration docs this skips the GC cleanup
 * pass at program termination - confirm against the targeted compiler version.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Main program.
 *
 * Processes the command line via TsvSelectOptions.processArgs, then calls the
 * tsvSelect instantiation matching the --rest option.
 *
 * Params:
 *   cmdArgs = Command line arguments, program name first.
 *
 * Returns: The exit code from processArgs when it signals that execution should
 *   stop, 1 if tsvSelect throws an Exception (reported on stderr), 0 otherwise.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];
    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }
    try
    {
        /* Invoke the tsvSelect template matching the --rest option chosen. Option args
         * are removed by command line processing (getopt). The program name and any files
         * remain. Pass the files to tsvSelect.
         */
        final switch (cmdopt.restArg)
        {
        case TsvSelectOptions.RestOption.none:
            tsvSelect!(RestLocation.none)(cmdopt);
            break;
        case TsvSelectOptions.RestOption.first:
            tsvSelect!(RestLocation.first)(cmdopt);
            break;
        case TsvSelectOptions.RestOption.last:
            tsvSelect!(RestLocation.last)(cmdopt);
            break;
        }
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }

    return 0;
}

// tsvSelect

/** Enumeration of the different specializations of the tsvSelect template.
 *
 * RestLocation is logically equivalent to the TsvSelectOptions.RestOption enum. It
 * is used by main to choose the appropriate tsvSelect template instantiation to call. It
 * is distinct from the TsvSelectOptions enum to separate it from the end-user UI. The
 * TsvSelectOptions version specifies the text of allowed values in command line arguments.
 */
enum RestLocation { none, first, last }

/** tsvSelect does the primary work of the tsv-select program.
 *
 * Input is read line by line, extracting the listed fields and writing them out in the order
 * specified. An exception is thrown on error.
 *
 * This function is templatized with instantiations for the different --rest options. This
 * avoids repeatedly running the same if-tests inside the inner loop. The main function
 * instantiates this function three times, once for each of the --rest options. It results
 * in a larger program, but is faster. Run-time improvements of 25% were measured compared
 * to the non-templatized version.
 *
 * Params:
 *   rest   = Compile-time choice of where fields not listed in --fields are written.
 *            Must agree with cmdopt.restArg (checked by assert).
 *   cmdopt = Options populated by TsvSelectOptions.processArgs. The input sources are
 *            consumed.
 *
 * Throws: Exception if a line does not contain all fields listed in --fields.
 */
void tsvSelect(RestLocation rest)(ref TsvSelectOptions cmdopt)
{
    import tsv_utils.common.utils: BufferedOutputRange,
        ByLineSourceRange, InputFieldReordering, LineBuffered, throwIfWindowsNewline;
    import std.algorithm: splitter;
    import std.array : appender;
    import std.format: format;
    import std.range;

    // Ensure the correct template instantiation was called.
    static if (rest == RestLocation.none)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.none);
    else static if (rest == RestLocation.first)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.first);
    else static if (rest == RestLocation.last)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.last);
    else
        static assert(false, "rest template argument does not match cmdopt.restArg.");

    /* Check that the input files were setup as expected. Should at least have one
     * input, stdin if nothing else, and newlines removed from the byLine range.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* The algorithm here assumes RestOption.none is not used with --exclude. */
    assert(cmdopt.excludedFieldsTable.length == 0 || rest != RestLocation.none);

    /* InputFieldReordering copies select fields from an input line to a new buffer.
     * The buffer is reordered in the process.
     */
    auto fieldReordering = new InputFieldReordering!char(cmdopt.fields);

    /* Fields not on the --fields list are added to a separate buffer so they can be
     * output as a group (the --rest option). This is done using an 'Appender', which
     * is faster than the ~= operator. The Appender is passed a GC allocated buffer
     * that grows as needed and is reused for each line. Typically it'll grow only
     * on the first line.
     */
    static if (rest != RestLocation.none)
    {
        auto leftOverFieldsAppender = appender!(char[][]);
    }

    /* BufferedOutputRange (from common/utils.d) is a performance improvement over
     * writing directly to stdout.
     */
    immutable LineBuffered isLineBuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered;
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout, isLineBuffered);

    /* Read each input file (or stdin) and iterate over each line.
     */
    foreach (fileNum, inputStream; cmdopt.inputSources.enumerate)
    {
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewline(line, inputStream.name, lineNum);

            if (lineNum == 1 && fileNum > 0 && cmdopt.hasHeader)
            {
                continue;   // Drop the header line from all but the first file.
            }

            static if (rest != RestLocation.none)
            {
                leftOverFieldsAppender.clear;

                /* Track the field location in the line. This enables bulk appending
                 * after the last specified field has been processed.
                 */
                size_t nextFieldStart = 0;
            }

            fieldReordering.initNewLine;

            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                static if (rest == RestLocation.none)
                {
                    fieldReordering.processNextField(fieldIndex, fieldValue);
                    if (fieldReordering.allFieldsFilled) break;
                }
                else
                {
                    /* Processing with 'rest' fields. States:
                     *  - Excluded fields and specified fields remain
                     *  - Only specified fields remain
                     *  - Only excluded fields remain
                     */

                    // Offset just past the delimiter that follows the current field.
                    nextFieldStart += fieldValue.length + 1;
                    bool excludedFieldsRemain = fieldIndex < cmdopt.excludedFieldsTable.length;
                    immutable isExcluded = excludedFieldsRemain && cmdopt.excludedFieldsTable[fieldIndex];

                    if (!isExcluded)
                    {
                        immutable numMatched = fieldReordering.processNextField(fieldIndex, fieldValue);

                        // Fields not on the --fields list go to the 'rest' group.
                        if (numMatched == 0) leftOverFieldsAppender.put(fieldValue);
                    }
                    else if (fieldIndex + 1 == cmdopt.excludedFieldsTable.length)
                    {
                        // This was the last entry in the excluded fields table.
                        excludedFieldsRemain = false;
                    }

                    if (fieldReordering.allFieldsFilled && !excludedFieldsRemain)
                    {
                        /* Processed all specified fields. Bulk append any fields
                         * remaining on the line. Cases:
                         *  - Current field is last field: nextFieldStart is
                         *    line.length + 1, nothing is appended.
                         *  - Otherwise: The rest of the line is appended as a single
                         *    entry, embedded delimiters included. This is an empty
                         *    entry if the line ends with a delimiter.
                         */
                        if (nextFieldStart <= line.length)
                        {
                            leftOverFieldsAppender.put(line[nextFieldStart .. $]);
                        }

                        break;
                    }
                }
            }

            // Finished with all fields in the line.
            enforce(fieldReordering.allFieldsFilled,
                    format("Not enough fields in line. File: %s, Line: %s",
                           inputStream.name, lineNum));

            // Write the re-ordered line.

            static if (rest == RestLocation.first)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                }
            }

            bufferedOutput.joinAppend(fieldReordering.outputFields, cmdopt.delim);

            static if (rest == RestLocation.last)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                }
            }

            bufferedOutput.appendln;

            /* Send the first line of the first file immediately. This helps detect
             * errors quickly in multi-stage unix pipelines. Note that tsv-select may
             * have been sent one line from an upstream process, usually a header line.
             */
            if (lineNum == 1 && fileNum == 0) bufferedOutput.flush;
        }
    }
}