/**
A variant of the unix 'cut' program, with the ability to reorder fields.

tsv-select is a variation on the Unix 'cut' utility, with the added ability to reorder
fields. Lines are read from files or standard input and split on a delimiter character.
Fields are written to standard output in the order listed. Fields can be listed more
than once, and fields not listed can be written out as a group.

This program is intended both as a useful utility and a D programming language example.
Functionality and constructs used include command line argument processing, file I/O,
exception handling, ranges, tuples and strings, templates, universal function call syntax
(UFCS), lambdas and functional programming constructs. Comments are more verbose than
typical to shed light on D programming constructs, but not to the level of a tutorial.

Copyright (c) 2015-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.tsv_select;   // Module name defaults to file name, but hyphens not allowed, so set it here.

// Imports used by multiple routines. Other imports are made in local context.
import std.exception : enforce;
import std.stdio;
import std.typecons : tuple, Tuple;

// 'Heredoc' style help text. When printed it is followed by a getopt formatted option list.
immutable helpText = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields numbers start with one. They are comma separated and ranges can be
used. Fields can be repeated, and fields not included in the '--f|fields'
option can be selected as a group using '--r|rest'. Fields can be dropped
using '--e|exclude'. Multiple files with header lines can be managed with
'--H|header', which retains the header of the first file only.

Examples:

   # Output fields 2 and 1, in that order
   tsv-select -f 2,1 data.tsv

   # Drop the first field, keep everything else.
   tsv-select --exclude 1 file.tsv

   # Move the first field to the end
   tsv-select -f 1 --rest first data.tsv

   # Multiple files with header lines. Keep only one header.
   tsv-select data*.tsv -H --fields 1,2,4-7,14

Use '--help-verbose' for detailed information.

Options:
EOS";

// Verbose help text, printed for '--help-verbose'. Examples must spell the long option
// as '--rest' (two dashes); a single-dash '-rest' is not the long option and is not
// expected to parse as one (see the std.getopt option syntax).
immutable helpTextVerbose = q"EOS
Synopsis: tsv-select [options] [file...]

tsv-select reads files or standard input and writes selected fields to
standard output. Fields are written in the order listed. This is similar
to Unix 'cut', but with the ability to reorder fields.

Fields numbers start with one. They are comma separated and ranges can be
used. Fields can be repeated, and fields not included in the '--f|fields'
option can be selected as a group using '--r|rest'. Use '--H|header' to
retain the header line from only the first file.

Fields can be excluded using '--e|exclude'. All fields not excluded are
output. '--f|fields' and '--r|rest' can be used with '--e|exclude' to
reorder non-excluded fields.

Examples:

   # Keep the first field from two files
   tsv-select -f 1 file1.tsv file2.tsv

   # Keep fields 1 and 2, retain the header from the first file
   tsv-select -H -f 1,2 file1.tsv file2.tsv

   # Field reordering and field ranges
   tsv-select -f 3,2,1 file.tsv
   tsv-select -f 1,4-7,11 file.tsv
   tsv-select -f 1,7-4,11 file.tsv

   # Repeating fields
   tsv-select -f 1,2,1 file.tsv
   tsv-select -f 1-3,3-1 file.tsv

   # Move field 5 to the front
   tsv-select -f 5 --rest last file.tsv

   # Move fields 4 and 5 to the end
   tsv-select -f 4,5 --rest first file.tsv

   # Drop the first field, keep everything else
   tsv-select --exclude 1 file.tsv

   # Move field 2 to the front and drop fields 10-15
   tsv-select -f 2 -e 10-15 file.tsv

   # Move field 2 to the end, dropping fields 10-15
   tsv-select -f 2 --rest first -e 10-15 file.tsv

Notes:
* One of '--f|fields' or '--e|exclude' is required.
* Fields specified by '--f|fields' and '--e|exclude' cannot overlap.
* When '--f|fields' and '--e|exclude' are used together, the effect is to
  specify '--rest last'. This can be overridden by using '--rest first'.
* Each input line must be long enough to contain all fields specified with
  '--f|fields'. This is not necessary for '--e|exclude' fields.

Options:
EOS";

/** Container for command line options.
 *
 * Holds the raw values filled in by getopt plus values derived from them
 * (inputSources, excludedFieldsTable). Populate it by calling processArgs.
 */
struct TsvSelectOptions
{
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    // The allowed values for the --rest option. The member names are the text
    // accepted on the command line.
    enum RestOption { none, first, last }

    string programName;                 /// Program name
    ByLineSourceRange!() inputSources;  /// Input Files
    bool helpVerbose = false;           /// --help-verbose
    bool hasHeader = false;             /// --H|header
    char delim = '\t';                  /// --d|delimiter
    size_t[] fields;                    /// --f|fields
    size_t[] excludedFieldsArg;         /// --e|exclude
    RestOption restArg;                 /// --rest first|last (none is hidden default)
    bool versionWanted = false;         /// --V|version
    bool[] excludedFieldsTable;         /// Derived. Lookup table for excluded fields.

    /** Process command line arguments (getopt cover).
     *
     * processArgs calls getopt to process command line arguments. It does any additional
     * validation and parameter derivations needed. A tuple is returned. First value is
     * true if command line arguments were successfully processed and execution should
     * continue, or false if an error occurred or the user asked for help. If false, the
     * second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first. On success it is
     *             truncated to length 1; the file arguments are moved to inputSources.
     *
     * Returns: tuple(true, 0) to continue; tuple(false, 0) after printing help or
     *   version text; tuple(false, 1) after an error. Exceptions raised during
     *   processing are caught here and reported on stderr rather than propagated.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : maxElement;
        import std.format : format;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print more detailed help.", &helpVerbose,

                // '-H' and '-V' are matched case-sensitively; the config is switched
                // back to case-insensitive for the options that follow each of them.
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "f|fields", "<field-list> Fields to retain. Fields are output in the order listed.",
                fields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "e|exclude", "<field-list> Fields to exclude.",
                excludedFieldsArg.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "r|rest", "first|last Output location for fields not included in '--f|fields'.", &restArg,
                "d|delimiter", "CHR Character to use as field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-select"));
                return tuple(false, 0);
            }

            /*
             * Consistency checks and derivations.
             */

            enforce(fields.length != 0 || excludedFieldsArg.length != 0,
                    "One of '--f|fields' or '--e|exclude' is required.");

            /* Remaining command line args are files. Use standard input if files
             * were not provided. Truncate cmdArgs to consume the arguments.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;
            inputSources = byLineSourceRange(filepaths);

            if (excludedFieldsArg.length > 0)
            {
                /* Make sure selected and excluded fields do not overlap. */
                foreach (e; excludedFieldsArg)
                {
                    foreach (f; fields)
                    {
                        enforce(e != f, "'--f|fields' and '--e|exclude' have overlapping fields.");
                    }
                }

                /* '--exclude' changes '--rest' default to 'last'. */
                if (restArg == RestOption.none) restArg = RestOption.last;

                /* Build the excluded field lookup table.
                 *
                 * Note: Users won't have any reason to expect memory is allocated based
                 * on the max field number. However, users might pick arbitrarily large
                 * numbers when trimming fields. So, limit the max field number to something
                 * big but reasonable (more than 1 million). The limit can be raised if use
                 * cases arise.
                 */
                size_t maxExcludedField = excludedFieldsArg.maxElement;
                size_t maxAllowedExcludedField = 1024 * 1024;

                enforce(maxExcludedField < maxAllowedExcludedField,
                        format("Maximum allowed '--e|exclude' field number is %d.",
                               maxAllowedExcludedField));

                /* The table is indexed by zero-based field index. Its last entry is
                 * always true, as its length is the largest excluded index plus one.
                 */
                excludedFieldsTable.length = maxExcludedField + 1;          // Initialized to false
                foreach (e; excludedFieldsArg) excludedFieldsTable[e] = true;
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

// D runtime option string, declared only for compiler front-end versions 2085 and later.
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Main program.
 *
 * Processes the command line, then calls the tsvSelect instantiation matching the
 * --rest option.
 *
 * Returns: 0 on success. If processArgs reports that execution should not continue,
 *   its exit code is returned. 1 is returned if tsvSelect throws an Exception, after
 *   the message is written to stderr.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvSelectOptions cmdopt;
    const r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];    // r[0]: continue execution?  r[1]: exit code if not.
    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }
    try
    {
        /* Invoke the tsvSelect template matching the --rest option chosen. Option args
         * are removed by command line processing (getopt). The input files were moved
         * into cmdopt.inputSources by processArgs, so cmdopt is all tsvSelect needs.
         */
        final switch (cmdopt.restArg)
        {
        case TsvSelectOptions.RestOption.none:
            tsvSelect!(RestLocation.none)(cmdopt);
            break;
        case TsvSelectOptions.RestOption.first:
            tsvSelect!(RestLocation.first)(cmdopt);
            break;
        case TsvSelectOptions.RestOption.last:
            tsvSelect!(RestLocation.last)(cmdopt);
            break;
        }
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }

    return 0;
}

// tsvSelect

/** Enumeration of the different specializations of the tsvSelect template.
 *
 * RestLocation is logically equivalent to the TsvSelectOptions.RestOption enum. It
 * is used by main to choose the appropriate tsvSelect template instantiation to call. It
 * is distinct from the TsvSelectOptions enum to separate it from the end-user UI. The
 * TsvSelectOptions version specifies the text of allowed values in command line arguments.
 */
enum RestLocation { none, first, last }

/** tsvSelect does the primary work of the tsv-select program.
 *
 * Input is read line by line, extracting the listed fields and writing them out in the order
 * specified. An exception is thrown on error.
 *
 * This function is templatized with instantiations for the different --rest options. This
 * avoids repeatedly running the same if-tests inside the inner loop. The main function
 * instantiates this function three times, once for each of the --rest options. It results
 * in a larger program, but is faster. Run-time improvements of 25% were measured compared
 * to the non-templatized version.
 *
 * Params:
 *   rest   = Compile-time selector for where fields not listed in --fields are written.
 *            Must agree with cmdopt.restArg (checked by assert).
 *   cmdopt = Options as prepared by TsvSelectOptions.processArgs.
 *
 * Throws: Exception (via enforce) when a line does not contain all fields listed in
 *   --fields. Helpers called here may throw as well.
 */
void tsvSelect(RestLocation rest)(ref TsvSelectOptions cmdopt)
{
    import tsv_utils.common.utils: BufferedOutputRange, ByLineSourceRange,
        InputFieldReordering, throwIfWindowsNewlineOnUnix;
    import std.algorithm: splitter;
    import std.array : appender;
    import std.format: format;
    import std.range;

    // Ensure the correct template instantiation was called.
    static if (rest == RestLocation.none)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.none);
    else static if (rest == RestLocation.first)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.first);
    else static if (rest == RestLocation.last)
        assert(cmdopt.restArg == TsvSelectOptions.RestOption.last);
    else
        static assert(false, "rest template argument does not match cmdopt.restArg.");

    /* Check that the input files were setup as expected. Should at least have one
     * input, stdin if nothing else, and newlines removed from the byLine range.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* The algorithm here assumes RestOption.none is not used with --exclude.
     * processArgs establishes this by defaulting --rest to 'last' when --exclude is given.
     */
    assert(cmdopt.excludedFieldsTable.length == 0 || rest != RestLocation.none);

    /* InputFieldReordering copies select fields from an input line to a new buffer.
     * The buffer is reordered in the process.
     */
    auto fieldReordering = new InputFieldReordering!char(cmdopt.fields);

    /* Fields not on the --fields list are added to a separate buffer so they can be
     * output as a group (the --rest option). This is done using an 'Appender', which
     * is faster than the ~= operator. The Appender is passed a GC allocated buffer
     * that grows as needed and is reused for each line. Typically it'll grow only
     * on the first line.
     */
    static if (rest != RestLocation.none)
    {
        auto leftOverFieldsAppender = appender!(char[][]);
    }

    /* BufferedOutputRange (from tsv_utils.common.utils) is a performance improvement
     * over writing directly to stdout.
     */
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    /* Read each input file (or stdin) and iterate over each line.
     * fileNum is zero-based; lineNum is one-based within each file.
     */
    foreach (fileNum, inputStream; cmdopt.inputSources.enumerate)
    {
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum);

            if (lineNum == 1 && fileNum > 0 && cmdopt.hasHeader)
            {
                continue;   // Drop the header line from all but the first file.
            }

            static if (rest != RestLocation.none)
            {
                leftOverFieldsAppender.clear;

                /* Track the field location in the line. This enables bulk appending
                 * after the last specified field has been processed.
                 */
                size_t nextFieldStart = 0;
            }

            fieldReordering.initNewLine;

            foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
            {
                static if (rest == RestLocation.none)
                {
                    // No 'rest' output: stop reading the line once every listed field is found.
                    fieldReordering.processNextField(fieldIndex, fieldValue);
                    if (fieldReordering.allFieldsFilled) break;
                }
                else
                {
                    /* Processing with 'rest' fields. States:
                     *  - Excluded fields and specified fields remain
                     *  - Only specified fields remain
                     *  - Only excluded fields remain
                     */

                    // Offset just past this field and its one-char delimiter.
                    nextFieldStart += fieldValue.length + 1;
                    bool excludedFieldsRemain = fieldIndex < cmdopt.excludedFieldsTable.length;
                    immutable isExcluded = excludedFieldsRemain && cmdopt.excludedFieldsTable[fieldIndex];

                    if (!isExcluded)
                    {
                        immutable numMatched = fieldReordering.processNextField(fieldIndex, fieldValue);

                        // Not a listed field and not excluded: it belongs to the 'rest' group.
                        if (numMatched == 0) leftOverFieldsAppender.put(fieldValue);
                    }
                    else if (fieldIndex + 1 == cmdopt.excludedFieldsTable.length)
                    {
                        // This was the highest-numbered excluded field; none are left.
                        excludedFieldsRemain = false;
                    }

                    if (fieldReordering.allFieldsFilled && !excludedFieldsRemain)
                    {
                        /* Processed all specified and excluded fields. Bulk append any
                         * fields remaining on the line. Cases:
                         *  - Current field is the last field: nextFieldStart is one past
                         *    line.length, there is nothing to append.
                         *  - Line ends with a delimiter right after the current field:
                         *    nextFieldStart equals line.length, the empty trailing field
                         *    is appended.
                         *  - Otherwise the remainder of the line, delimiters included, is
                         *    appended as a single chunk.
                         */
                        if (nextFieldStart <= line.length)
                        {
                            leftOverFieldsAppender.put(line[nextFieldStart .. $]);
                        }

                        break;
                    }
                }
            }

            // Finished with all fields in the line.
            enforce(fieldReordering.allFieldsFilled,
                    format("Not enough fields in line. File: %s, Line: %s",
                           inputStream.name, lineNum));

            // Write the re-ordered line. A delimiter is placed between the 'rest' group
            // and the listed fields only when both are non-empty.

            static if (rest == RestLocation.first)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                }
            }

            bufferedOutput.joinAppend(fieldReordering.outputFields, cmdopt.delim);

            static if (rest == RestLocation.last)
            {
                if (leftOverFieldsAppender.data.length > 0)
                {
                    if (cmdopt.fields.length > 0) bufferedOutput.append(cmdopt.delim);
                    bufferedOutput.joinAppend(leftOverFieldsAppender.data, cmdopt.delim);
                }
            }

            bufferedOutput.appendln;
        }
    }
}