/**
Command line tool that joins tab-separated value files based on a common key.

This tool joins lines from tab-delimited files based on a common key. One file, the 'filter'
file, contains the records (lines) being matched. The other input files are searched for
matching records. Matching records are written to standard output, along with any designated
fields from the 'filter' file. In database parlance this is a 'hash semi-join'.

Copyright (c) 2015-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_join;

import std.exception : enforce;
import std.stdio;
import std.format : format;
import std.typecons : tuple;

auto helpText = q"EOS
Synopsis: tsv-join --filter-file file [options] file [file...]

tsv-join matches input lines against lines from a 'filter' file. The match is
based on fields or the entire line. Use '--help-verbose' for more details.

Options:
EOS";

auto helpTextVerbose = q"EOS
Synopsis: tsv-join --filter-file file [options] file [file...]

tsv-join matches input lines against lines from a 'filter' file. The match is
based on exact match comparison of one or more 'key' fields. Fields are TAB
delimited by default. Matching lines are written to standard output, along with
any additional fields from the key file that have been specified. An example:

  tsv-join --filter-file filter.tsv --key-fields 1 --append-fields 5,6 data.tsv

This reads filter.tsv, creating a hash table keyed on field 1. Lines from data.tsv
are read one at a time. If field 1 is found in the hash table, the line is written
to standard output with fields 5 and 6 from the filter file appended. In database
parlance this is a "hash semi join". Note the asymmetric relationship: Records in
the filter file should be unique, but data.tsv lines can repeat.

tsv-join can also work as a simple filter, this is the default behavior. Example:

  tsv-join --filter-file filter.tsv data.tsv

This outputs all lines from data.tsv found in filter.tsv. --key-fields can still
be used to define the match key. The --exclude option can be used to exclude
matched lines rather than keep them.

Multiple fields can be specified as keys and append fields. Field numbers start
at one, zero represents the whole line. Fields are comma separated and ranges
can be used. Example:

  tsv-join -f filter.tsv -k 1,2 --append-fields 3-7 data.tsv

Options:
EOS";

/** Container for command line options.
 */
struct TsvJoinOptions
{
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange,
        inputSourceRange, InputSourceRange, ReadHeader;

    /* Data available to the main program. Variables used only during command line
     * argument processing are local to processArgs.
     */
    string programName;
    InputSourceRange inputSources;        // Input Files
    ByLineSourceRange!() filterSource;    // Derived: --filter
    size_t[] keyFields;                   // --key-fields
    size_t[] dataFields;                  // --data-fields
    size_t[] appendFields;                // --append-fields
    bool hasHeader = false;               // --H|header
    string appendHeaderPrefix = "";       // --append-header-prefix
    bool writeAll = false;                // --write-all
    string writeAllValue;                 // --write-all
    bool exclude = false;                 // --exclude
    char delim = '\t';                    // --delimiter
    bool allowDupliateKeys = false;       // --allow-duplicate-keys. NOTE(review): identifier is
                                          // misspelled ("Dupliate"); kept as-is — it is a public
                                          // field, renaming would change the struct's interface.
    bool keyIsFullLine = false;           // Derived: --key-fields 0
    bool dataIsFullLine = false;          // Derived: --data-fields 0
    bool appendFullLine = false;          // Derived: --append-fields 0

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     * If the whole line is the key, the individual fields lists will be cleared.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        string filterFile;               // --filter
        bool helpVerbose = false;        // --help-verbose
        bool versionWanted = false;      // --V|version

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        /* Handler for --write-all. Special handler so two values can be set:
         * the boolean flag and the string used for unmatched records. */
        void writeAllHandler(string option, string value)
        {
            debug stderr.writeln("[writeAllHandler] |", option, "| |", value, "|");
            writeAll = true;
            writeAllValue = value;
        }

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                "f|filter-file", "FILE (Required) File with records to use as a filter.", &filterFile,

                "k|key-fields", "<field-list> Fields to use as join key. Default: 0 (entire line).",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "d|data-fields", "<field-list> Data record fields to use as join key, if different than --key-fields.",
                dataFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "a|append-fields", "<field-list> Filter fields to append to matched records.",
                appendFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                // -H must be case sensitive so it is not confused with --help abbreviations.
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "p|prefix", "STR String to use as a prefix for --append-fields when writing a header line.", &appendHeaderPrefix,
                "w|write-all", "STR Output all data records. STR is the --append-fields value when writing unmatched records.", &writeAllHandler,
                "e|exclude", " Exclude matching records.", &exclude,
                "delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                "z|allow-duplicate-keys",
                " Allow duplicate keys with different append values (last entry wins).", &allowDupliateKeys,
                // -V must be case sensitive to distinguish it from -v style abbreviations.
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-join"));
                return tuple(false, 0);
            }

            /* File arguments.
             *   * --filter-file required, converted to a one-element ByLineSourceRange
             *   * Remaining command line args are input files.
             */
            enforce(filterFile.length != 0,
                    "Required option --filter-file was not supplied.");

            /* Both the filter and the data cannot come from standard input at once. */
            enforce(!(filterFile == "-" && cmdArgs.length == 1),
                    "A data file is required when standard input is used for the filter file (--f|filter-file -).");

            filterSource = byLineSourceRange([filterFile]);

            /* No data file args means read from standard input ("-"). */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;
            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
            inputSources = inputSourceRange(filepaths, readHeader);

            consistencyValidations(cmdArgs);
            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* This routine does validations not handled by getopt, usually because they
     * involve interactions between multiple parameters.
     */
    private void consistencyValidations(ref string[] processedCmdArgs)
    {
        import std.algorithm : all;

        if (writeAll)
        {
            enforce(appendFields.length != 0,
                    "Use --a|append-fields when using --w|write-all.");

            enforce(!(appendFields.length == 1 && appendFields[0] == 0),
                    "Cannot use '--a|append-fields 0' (whole line) when using --w|write-all.");
        }

        /* Note: this also indirectly rules out --write-all with --exclude,
         * since --write-all requires --append-fields. */
        enforce(!(appendFields.length > 0 && exclude),
                "--e|exclude cannot be used with --a|append-fields.");

        enforce(appendHeaderPrefix.length == 0 || hasHeader,
                "Use --header when using --p|prefix.");

        enforce(dataFields.length == 0 || keyFields.length == dataFields.length,
                "Different number of --k|key-fields and --d|data-fields.");

        enforce(keyFields.length != 1 ||
                dataFields.length != 1 ||
                (keyFields[0] == 0 && dataFields[0] == 0) ||
                (keyFields[0] != 0 && dataFields[0] != 0),
                "If either --k|key-field or --d|data-field is zero both must be zero.");

        /* Field zero (whole line) is only allowed as the sole entry in a list. */
        enforce((keyFields.length <= 1 || all!(a => a != 0)(keyFields)) &&
                (dataFields.length <= 1 || all!(a => a != 0)(dataFields)) &&
                (appendFields.length <= 1 || all!(a => a != 0)(appendFields)),
                "Field 0 (whole line) cannot be combined with individual fields (non-zero).");
    }

    /* Post-processing derivations. Converts the 'field zero means whole line'
     * convention into boolean flags and shifts remaining indexes to zero-based.
     */
    void derivations()
    {
        import std.algorithm : each;
        import std.range;

        // Convert 'full-line' field indexes (index zero) to boolean flags.
        if (keyFields.length == 0)
        {
            // No keys specified at all: the whole line is the key by default.
            assert(dataFields.length == 0);
            keyIsFullLine = true;
            dataIsFullLine = true;
        }
        else if (keyFields.length == 1 && keyFields[0] == 0)
        {
            keyIsFullLine = true;
            keyFields.popFront;
            dataIsFullLine = true;

            if (dataFields.length == 1)
            {
                // consistencyValidations guarantees both are zero or neither is.
                assert(dataFields[0] == 0);
                dataFields.popFront;
            }
        }

        if (appendFields.length == 1 && appendFields[0] == 0)
        {
            appendFullLine = true;
            appendFields.popFront;
        }

        assert(!(keyIsFullLine && keyFields.length > 0));
        assert(!(dataIsFullLine && dataFields.length > 0));
        assert(!(appendFullLine && appendFields.length > 0));

        // Switch to zero-based field indexes.
        keyFields.each!((ref a) => --a);
        dataFields.each!((ref a) => --a);
        appendFields.each!((ref a) => --a);
    }
}

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Main program.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvJoinOptions cmdopt;
    auto r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];
    try tsvJoin(cmdopt);
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }
    return 0;
}

/** tsvJoin does the primary work of the tsv-join program.
 */
void tsvJoin(ref TsvJoinOptions cmdopt)
{
    import tsv_utils.common.utils : ByLineSourceRange, bufferedByLine, BufferedOutputRange,
        InputFieldReordering, InputSourceRange, throwIfWindowsNewlineOnUnix;
    import std.algorithm : splitter;
    import std.array : join;
    import std.range;
    import std.conv : to;

    /* Check that the input files were setup correctly. Should have one filter file as a
     * ByLineSourceRange. There should be at least one input file as an InputSourceRange.
     */
    assert(cmdopt.filterSource.length == 1);
    static assert(is(typeof(cmdopt.filterSource) == ByLineSourceRange!(No.keepTerminator)));

    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));

    /* State, variables, and convenience derivations.
     *
     * Combinations of individual fields and whole line (field zero) are convenient for the
     * user, but create complexities for the program. Many combinations are disallowed by
     * command line processing, but the remaining combos still leave several states. Also,
     * this code optimizes by doing only necessary operations, further complicating state.
     * Here's a guide to variables and state.
     * - cmdopt.keyFields, cmdopt.dataFields arrays - Individual field indexes used as keys.
     *   Empty if the whole line is used as a key. Must be the same length.
     * - cmdopt.keyIsFullLine, cmdopt.dataIsFullLine - True when the whole line is used as the key.
     * - cmdopt.appendFields array - Indexes of individual filter file fields being appended.
     *   Empty if appending the full line, or if not appending anything.
     * - cmdopt.appendFullLine - True when the whole line is being appended.
     * - isAppending - True if something is being appended.
     * - cmdopt.writeAll - True if all lines are being written
     */
    /* Convenience derivations. */
    auto numKeyFields = cmdopt.keyFields.length;
    auto numAppendFields = cmdopt.appendFields.length;
    bool isAppending = (cmdopt.appendFullLine || numAppendFields > 0);

    /* Mappings from field indexes in the input lines to collection arrays.
     * When --data-fields was not given, data lines use the same reordering as filter lines. */
    auto filterKeysReordering = new InputFieldReordering!char(cmdopt.keyFields);
    auto dataKeysReordering = (cmdopt.dataFields.length == 0) ?
        filterKeysReordering : new InputFieldReordering!char(cmdopt.dataFields);
    auto appendFieldsReordering = new InputFieldReordering!char(cmdopt.appendFields);

    /* The master filter hash. The key is the delimited fields concatenated together
     * (including separators). The value is the appendFields concatenated together, as
     * they will be appended to the input line. Both the keys and append fields are
     * assembled in the order specified, though this is only required for append fields.
     */
    string[string] filterHash;
    string appendFieldsHeader;

    /* The append values for unmatched records (--write-all). */
    char[] appendFieldsUnmatchedValue;

    if (cmdopt.writeAll)
    {
        assert(cmdopt.appendFields.length > 0); // Checked in consistencyValidations

        // reserve space for n values and n-1 delimiters
        appendFieldsUnmatchedValue.reserve(cmdopt.appendFields.length * (cmdopt.writeAllValue.length + 1) - 1);

        appendFieldsUnmatchedValue ~= cmdopt.writeAllValue;
        for (size_t i = 1; i < cmdopt.appendFields.length; ++i)
        {
            appendFieldsUnmatchedValue ~= cmdopt.delim;
            appendFieldsUnmatchedValue ~= cmdopt.writeAllValue;
        }
    }

    /* Read the filter file, building the filter hash. */
    {
        bool needPerFieldProcessing = (numKeyFields > 0) || (numAppendFields > 0);
        auto filterStream = cmdopt.filterSource.front;
        foreach (lineNum, line; filterStream.byLine.enumerate(1))
        {
            debug writeln("[filter line] |", line, "|");
            if (needPerFieldProcessing)
            {
                filterKeysReordering.initNewLine;
                appendFieldsReordering.initNewLine;

                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    filterKeysReordering.processNextField(fieldIndex,fieldValue);
                    appendFieldsReordering.processNextField(fieldIndex,fieldValue);

                    // Stop splitting as soon as all needed fields have been seen.
                    if (filterKeysReordering.allFieldsFilled && appendFieldsReordering.allFieldsFilled)
                    {
                        break;
                    }
                }

                // Processed all fields in the line.
                enforce(filterKeysReordering.allFieldsFilled && appendFieldsReordering.allFieldsFilled,
                        format("Not enough fields in line. File: %s, Line: %s",
                               filterStream.name, lineNum));
            }

            string key = cmdopt.keyIsFullLine ?
                line.to!string : filterKeysReordering.outputFields.join(cmdopt.delim).to!string;
            string appendValues = cmdopt.appendFullLine ?
                line.to!string : appendFieldsReordering.outputFields.join(cmdopt.delim).to!string;

            debug writeln(" --> [key]:[append] => [", key, "]:[", appendValues, "]");

            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filterStream.name, lineNum);

            if (lineNum == 1 && cmdopt.hasHeader)
            {
                /* The filter header line is not added to the hash. It supplies the
                 * header text for the appended fields, optionally prefixed. */
                if (cmdopt.appendHeaderPrefix.length == 0)
                {
                    appendFieldsHeader = appendValues;
                }
                else
                {
                    foreach (fieldIndex, fieldValue; appendValues.splitter(cmdopt.delim).enumerate)
                    {
                        if (fieldIndex > 0) appendFieldsHeader ~= cmdopt.delim;
                        appendFieldsHeader ~= cmdopt.appendHeaderPrefix;
                        appendFieldsHeader ~= fieldValue;
                    }
                }
            }
            else
            {
                if (isAppending && !cmdopt.allowDupliateKeys)
                {
                    string* currAppendValues = (key in filterHash);

                    /* Note: enforce's message argument is lazy, so the dereference below
                     * only occurs when the condition fails (currAppendValues non-null). */
                    enforce(currAppendValues is null || *currAppendValues == appendValues,
                            format("Duplicate keys with different append values (use --z|allow-duplicate-keys to ignore)\n   [key 1][values]: [%s][%s]\n   [key 2][values]: [%s][%s]",
                                   key, *currAppendValues, key, appendValues));
                }
                filterHash[key] = appendValues;
            }
        }

        /* popFront here closes the filter file. */
        cmdopt.filterSource.popFront;
    }

    /* Now process each input file, one line at a time. */

    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    /* First header is read during command line argument processing. Write it out,
     * with the appended-fields header if appending. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto inputStream = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        bufferedOutput.append(inputStream.header);
        if (isAppending)
        {
            bufferedOutput.append(cmdopt.delim);
            bufferedOutput.append(appendFieldsHeader);
        }
        bufferedOutput.appendln;
    }

    immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;

    foreach (inputStream; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);

        foreach (lineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine))
        {
            debug writeln("[input line] |", line, "|");

            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum);

            /*
             * Next block checks if the input line matches a hash entry. Two cases:
             *   a) The whole line is the key. Simply look it up in the hash.
             *   b) Individual fields are used as the key - Assemble key and look it up.
             *
             * At the end, appendFields will contain the result of the hash lookup.
             */
            string* appendFields;
            if (cmdopt.keyIsFullLine)
            {
                appendFields = (line in filterHash);
            }
            else
            {
                dataKeysReordering.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    dataKeysReordering.processNextField(fieldIndex, fieldValue);
                    if (dataKeysReordering.allFieldsFilled) break;
                }
                // Processed all fields in the line.
                enforce(dataKeysReordering.allFieldsFilled,
                        format("Not enough fields in line. File: %s, Line: %s",
                               inputStream.name, lineNum));

                appendFields = (dataKeysReordering.outputFields.join(cmdopt.delim) in filterHash);
            }

            bool matched = (appendFields !is null);
            debug writeln(" --> matched? ", matched);
            /* Write the line when it satisfies the match/exclude criteria, or always
             * when --write-all is in effect (unmatched lines get the fill value). */
            if (cmdopt.writeAll || (matched && !cmdopt.exclude) || (!matched && cmdopt.exclude))
            {
                bufferedOutput.append(line);
                if (isAppending)
                {
                    bufferedOutput.append(cmdopt.delim);
                    bufferedOutput.append(matched ? *appendFields : appendFieldsUnmatchedValue);
                }
                bufferedOutput.appendln();
            }
        }
    }
}