/**
Command line tool that joins tab-separated value files based on a common key.

This tool joins lines from tab-delimited files based on a common key. One file, the 'filter'
file, contains the records (lines) being matched. The other input files are searched for
matching records. Matching records are written to standard output, along with any designated
fields from the 'filter' file. In database parlance this is a 'hash semi-join'.

Copyright (c) 2015-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_join;

import std.stdio;
import std.format : format;
import std.typecons : tuple;

auto helpText = q"EOS
Synopsis: tsv-join --filter-file file [options] file [file...]

tsv-join matches input lines against lines from a 'filter' file. The match is
based on fields or the entire line. Use '--help-verbose' for more details.

Options:
EOS";

auto helpTextVerbose = q"EOS
Synopsis: tsv-join --filter-file file [options] file [file...]

tsv-join matches input lines against lines from a 'filter' file. The match is
based on exact match comparison of one or more 'key' fields. Fields are TAB
delimited by default. Matching lines are written to standard output, along with
any additional fields from the key file that have been specified. An example:

  tsv-join --filter-file filter.tsv --key-fields 1 --append-fields 5,6 data.tsv

This reads filter.tsv, creating a hash table keyed on field 1. Lines from data.tsv
are read one at a time. If field 1 is found in the hash table, the line is written
to standard output with fields 5 and 6 from the filter file appended. In database
parlance this is a "hash semi join". Note the asymmetric relationship: Records in
the filter file should be unique, but data.tsv lines can repeat.

tsv-join can also work as a simple filter, this is the default behavior. Example:

  tsv-join --filter-file filter.tsv data.tsv

This outputs all lines from data.tsv found in filter.tsv. --key-fields can still
be used to define the match key. The --exclude option can be used to exclude
matched lines rather than keep them.

Multiple fields can be specified as keys and append fields. Field numbers start
at one, zero represents the whole line. Fields are comma separated and ranges
can be used. Example:

  tsv-join -f filter.tsv -k 1,2 --append-fields 3-7 data.tsv

Options:
EOS";

/** Container for command line options.
 */
struct TsvJoinOptions
{
    string programName;
    string filterFile;                 // --filter
    size_t[] keyFields;                // --key-fields
    size_t[] dataFields;               // --data-fields
    size_t[] appendFields;             // --append-fields
    bool hasHeader = false;            // --H|header
    string appendHeaderPrefix = "";    // --append-header-prefix
    bool writeAll = false;             // --write-all
    string writeAllValue;              // --write-all
    bool exclude = false;              // --exclude
    char delim = '\t';                 // --delimiter
    bool helpVerbose = false;          // --help-verbose
    bool versionWanted = false;        // --V|version
    bool allowDuplicateKeys = false;   // --allow-duplicate-keys
    bool keyIsFullLine = false;        // Derived: --key-fields 0
    bool dataIsFullLine = false;       // Derived: --data-fields 0
    bool appendFullLine = false;       // Derived: --append-fields 0

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     * If the whole line is the key, the individual fields lists will be cleared.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsvutil : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        /* Handler for --write-all. Special handler so two values can be set. */
        void writeAllHandler(string option, string value)
        {
            debug stderr.writeln("[writeAllHandler] |", option, "| |", value, "|");
            writeAll = true;
            writeAllValue = value;
        }

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "              Print full help.", &helpVerbose,
                "f|filter-file",   "FILE          (Required) File with records to use as a filter.", &filterFile,

                "k|key-fields",    "<field-list>  Fields to use as join key. Default: 0 (entire line).",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "d|data-fields",   "<field-list>  Data record fields to use as join key, if different than --key-fields.",
                dataFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "a|append-fields", "<field-list>  Filter fields to append to matched records.",
                appendFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                std.getopt.config.caseSensitive,
                "H|header",        "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "p|prefix",        "STR           String to use as a prefix for --append-fields when writing a header line.", &appendHeaderPrefix,
                "w|write-all",     "STR           Output all data records. STR is the --append-fields value when writing unmatched records.", &writeAllHandler,
                "e|exclude",       "              Exclude matching records.", &exclude,
                "delimiter",       "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                "z|allow-duplicate-keys",
                                   "              Allow duplicate keys with different append values (last entry wins).", &allowDuplicateKeys,
                std.getopt.config.caseSensitive,
                "V|version",       "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-join"));
                return tuple(false, 0);
            }

            consistencyValidations(cmdArgs);
            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* This routine does validations not handled by getopt, usually because they
     * involve interactions between multiple parameters. Throws an Exception with
     * a user-facing message when a validation fails.
     */
    private void consistencyValidations(ref string[] processedCmdArgs)
    {
        import std.algorithm : any;

        if (filterFile.length == 0)
        {
            throw new Exception("Required option --filter-file was not supplied.");
        }
        else if (filterFile == "-" && processedCmdArgs.length == 1)
        {
            throw new Exception("A data file is required when standard input is used for the filter file (--f|filter-file -).");
        }

        if (writeAll && appendFields.length == 0)
        {
            throw new Exception("Use --a|append-fields when using --w|write-all.");
        }

        if (writeAll && appendFields.length == 1 && appendFields[0] == 0)
        {
            throw new Exception("Cannot use '--a|append-fields 0' (whole line) when using --w|write-all.");
        }

        if (appendFields.length > 0 && exclude)
        {
            throw new Exception("--e|exclude cannot be used with --a|append-fields.");
        }

        if (appendHeaderPrefix.length > 0 && !hasHeader)
        {
            throw new Exception("Use --header when using --p|prefix.");
        }

        if (dataFields.length > 0 && keyFields.length != dataFields.length)
        {
            throw new Exception("Different number of --k|key-fields and --d|data-fields.");
        }

        if (keyFields.length == 1 && dataFields.length == 1 &&
            ((keyFields[0] == 0 && dataFields[0] != 0) || (keyFields[0] != 0 && dataFields[0] == 0)))
        {
            throw new Exception("If either --k|key-field or --d|data-field is zero both must be zero.");
        }

        if ((keyFields.length > 1 && any!(a => a == 0)(keyFields)) ||
            (dataFields.length > 1 && any!(a => a == 0)(dataFields)) ||
            (appendFields.length > 1 && any!(a => a == 0)(appendFields)))
        {
            throw new Exception("Field 0 (whole line) cannot be combined with individual fields (non-zero).");
        }
    }

    /* Post-processing derivations. */
    void derivations()
    {
        import std.algorithm : each;
        import std.range;

        // Convert 'full-line' field indexes (index zero) to boolean flags.
        if (keyFields.length == 0)
        {
            assert(dataFields.length == 0);
            keyIsFullLine = true;
            dataIsFullLine = true;
        }
        else if (keyFields.length == 1 && keyFields[0] == 0)
        {
            keyIsFullLine = true;
            keyFields.popFront;
            dataIsFullLine = true;

            if (dataFields.length == 1)
            {
                assert(dataFields[0] == 0);
                dataFields.popFront;
            }
        }

        if (appendFields.length == 1 && appendFields[0] == 0)
        {
            appendFullLine = true;
            appendFields.popFront;
        }

        assert(!(keyIsFullLine && keyFields.length > 0));
        assert(!(dataIsFullLine && dataFields.length > 0));
        assert(!(appendFullLine && appendFields.length > 0));

        // Switch to zero-based field indexes.
        keyFields.each!((ref a) => --a);
        dataFields.each!((ref a) => --a);
        appendFields.each!((ref a) => --a);
    }
}

/** Main program.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvJoinOptions cmdopt;
    auto r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];
    try tsvJoin(cmdopt, cmdArgs[1..$]);
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }
    return 0;
}

/** tsvJoin does the primary work of the tsv-join program.
 */
void tsvJoin(in TsvJoinOptions cmdopt, in string[] inputFiles)
{
    import tsvutil : InputFieldReordering, BufferedOutputRange, throwIfWindowsNewlineOnUnix;
    import std.algorithm : splitter;
    import std.array : join;
    import std.range;
    import std.conv : to;

    /* State, variables, and convenience derivations.
     *
     * Combinations of individual fields and whole line (field zero) are convenient for the
     * user, but create complexities for the program. Many combinations are disallowed by
     * command line processing, but the remaining combos still leave several states. Also,
     * this code optimizes by doing only necessary operations, further complicating state.
     * Here's a guide to variables and state.
     * - cmdopt.keyFields, cmdopt.dataFields arrays - Individual field indexes used as keys.
     *      Empty if the whole line is used as a key. Must be the same length.
     * - cmdopt.keyIsFullLine, cmdopt.dataIsFullLine - True when the whole line is used as the key.
     * - cmdopt.appendFields array - Indexes of individual filter file fields being appended.
     *      Empty if appending the full line, or if not appending anything.
     * - cmdopt.appendFullLine - True when the whole line is being appended.
     * - isAppending - True if something is being appended.
     * - cmdopt.writeAll - True if all lines are being written
     */
    /* Convenience derivations. */
    auto numKeyFields = cmdopt.keyFields.length;
    auto numAppendFields = cmdopt.appendFields.length;
    bool isAppending = (cmdopt.appendFullLine || numAppendFields > 0);

    /* Mappings from field indexes in the input lines to collection arrays. */
    auto filterKeysReordering = new InputFieldReordering!char(cmdopt.keyFields);
    auto dataKeysReordering = (cmdopt.dataFields.length == 0) ?
        filterKeysReordering : new InputFieldReordering!char(cmdopt.dataFields);
    auto appendFieldsReordering = new InputFieldReordering!char(cmdopt.appendFields);

    /* The master filter hash. The key is the delimited fields concatenated together
     * (including separators). The value is the appendFields concatenated together, as
     * they will be appended to the input line. Both the keys and append fields are
     * assembled in the order specified, though this is only required for append fields.
     */
    string[string] filterHash;
    string appendFieldsHeader;

    /* The append values for unmatched records. */
    char[] appendFieldsUnmatchedValue;

    if (cmdopt.writeAll)
    {
        assert(cmdopt.appendFields.length > 0);  // Checked in consistencyValidations

        // reserve space for n values and n-1 delimiters
        appendFieldsUnmatchedValue.reserve(cmdopt.appendFields.length * (cmdopt.writeAllValue.length + 1) - 1);

        appendFieldsUnmatchedValue ~= cmdopt.writeAllValue;
        for (size_t i = 1; i < cmdopt.appendFields.length; ++i)
        {
            appendFieldsUnmatchedValue ~= cmdopt.delim;
            appendFieldsUnmatchedValue ~= cmdopt.writeAllValue;
        }
    }

    /* Read the filter file. */
    {
        bool needPerFieldProcessing = (numKeyFields > 0) || (numAppendFields > 0);
        auto filterStream = (cmdopt.filterFile == "-") ? stdin : cmdopt.filterFile.File;
        foreach (lineNum, line; filterStream.byLine.enumerate(1))
        {
            debug writeln("[filter line] |", line, "|");
            if (needPerFieldProcessing)
            {
                filterKeysReordering.initNewLine;
                appendFieldsReordering.initNewLine;

                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    filterKeysReordering.processNextField(fieldIndex,fieldValue);
                    appendFieldsReordering.processNextField(fieldIndex,fieldValue);

                    if (filterKeysReordering.allFieldsFilled && appendFieldsReordering.allFieldsFilled)
                    {
                        break;
                    }
                }
                // Processed all fields in the line.
                if (!filterKeysReordering.allFieldsFilled || !appendFieldsReordering.allFieldsFilled)
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               (cmdopt.filterFile == "-") ? "Standard Input" : cmdopt.filterFile, lineNum));
                }
            }

            string key = cmdopt.keyIsFullLine ?
                line.to!string : filterKeysReordering.outputFields.join(cmdopt.delim).to!string;
            string appendValues = cmdopt.appendFullLine ?
                line.to!string : appendFieldsReordering.outputFields.join(cmdopt.delim).to!string;

            debug writeln("  --> [key]:[append] => [", key, "]:[", appendValues, "]");

            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, cmdopt.filterFile, lineNum);

            if (lineNum == 1 && cmdopt.hasHeader)
            {
                if (cmdopt.appendHeaderPrefix.length == 0)
                {
                    appendFieldsHeader = appendValues;
                }
                else
                {
                    foreach (fieldIndex, fieldValue; appendValues.splitter(cmdopt.delim).enumerate)
                    {
                        if (fieldIndex > 0) appendFieldsHeader ~= cmdopt.delim;
                        appendFieldsHeader ~= cmdopt.appendHeaderPrefix;
                        appendFieldsHeader ~= fieldValue;
                    }
                }
            }
            else
            {
                if (isAppending && !cmdopt.allowDuplicateKeys)
                {
                    string* currAppendValues = (key in filterHash);
                    if (currAppendValues !is null && *currAppendValues != appendValues)
                    {
                        throw new Exception(
                            format("Duplicate keys with different append values (use --z|allow-duplicate-keys to ignore)\n   [key 1][values]: [%s][%s]\n   [key 2][values]: [%s][%s]",
                                   key, *currAppendValues, key, appendValues));
                    }
                }
                filterHash[key] = appendValues;
            }
        }
    }

    filterHash.rehash;  // For faster lookups. (Per docs. In my tests no performance delta.)

    /* Now process each input file, one line at a time. */

    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
    bool headerWritten = false;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            debug writeln("[input line] |", line, "|");

            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            if (lineNum == 1 && cmdopt.hasHeader)
            {
                /* Header line processing. Write the header from the first input file
                 * only; headers from subsequent files are skipped.
                 */
                if (!headerWritten)
                {
                    bufferedOutput.append(line);
                    if (isAppending)
                    {
                        bufferedOutput.append(cmdopt.delim);
                        bufferedOutput.append(appendFieldsHeader);
                    }
                    bufferedOutput.appendln;
                    headerWritten = true;
                }
            }
            else
            {
                /* Regular line (not a header line).
                 *
                 * Next block checks if the input line matches a hash entry. Two cases:
                 *   a) The whole line is the key. Simply look it up in the hash.
                 *   b) Individual fields are used as the key - Assemble key and look it up.
                 *
                 * At the end, appendFields will contain the result of the hash lookup.
                 */
                string* appendFields;
                if (cmdopt.keyIsFullLine)
                {
                    appendFields = (line in filterHash);
                }
                else
                {
                    dataKeysReordering.initNewLine;
                    foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                    {
                        dataKeysReordering.processNextField(fieldIndex, fieldValue);
                        if (dataKeysReordering.allFieldsFilled) break;
                    }
                    // Processed all fields in the line.
                    if (!dataKeysReordering.allFieldsFilled)
                    {
                        throw new Exception(
                            format("Not enough fields in line. File: %s, Line: %s",
                                   (filename == "-") ? "Standard Input" : filename, lineNum));
                    }
                    appendFields = (dataKeysReordering.outputFields.join(cmdopt.delim) in filterHash);
                }

                bool matched = (appendFields !is null);
                debug writeln("   --> matched? ", matched);
                if (cmdopt.writeAll || (matched && !cmdopt.exclude) || (!matched && cmdopt.exclude))
                {
                    bufferedOutput.append(line);
                    if (isAppending)
                    {
                        bufferedOutput.append(cmdopt.delim);
                        bufferedOutput.append(matched ? *appendFields : appendFieldsUnmatchedValue);
                    }
                    bufferedOutput.appendln();
                }
            }
        }
    }
}