/**
Command line tool that joins tab-separated value files based on a common key.

This tool joins lines from tab-delimited files based on a common key. One file, the 'filter'
file, contains the records (lines) being matched. The other input files are searched for
matching records. Matching records are written to standard output, along with any designated
fields from the 'filter' file. In database parlance this is a 'hash semi-join'.

Copyright (c) 2015-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_join;

import std.stdio;
import std.format : format;
import std.typecons : tuple;

auto helpText = q"EOS
Synopsis: tsv-join --filter-file file [options] file [file...]

tsv-join matches input lines against lines from a 'filter' file. The match is
based on fields or the entire line. Use '--help-verbose' for more details.

Options:
EOS";

auto helpTextVerbose = q"EOS
Synopsis: tsv-join --filter-file file [options] file [file...]

tsv-join matches input lines against lines from a 'filter' file. The match is
based on exact match comparison of one or more 'key' fields. Fields are TAB
delimited by default. Matching lines are written to standard output, along with
any additional fields from the key file that have been specified. An example:

   tsv-join --filter-file filter.tsv --key-fields 1 --append-fields 5,6 data.tsv

This reads filter.tsv, creating a hash table keyed on field 1. Lines from data.tsv
are read one at a time. If field 1 is found in the hash table, the line is written
to standard output with fields 5 and 6 from the filter file appended. In database
parlance this is a "hash semi join". Note the asymmetric relationship: Records in
the filter file should be unique, but data.tsv lines can repeat.

tsv-join can also work as a simple filter, this is the default behavior. Example:

   tsv-join --filter-file filter.tsv data.tsv

This outputs all lines from data.tsv found in filter.tsv. --key-fields can still
be used to define the match key. The --exclude option can be used to exclude
matched lines rather than keep them.

Multiple fields can be specified as keys and append fields. Field numbers start
at one, zero represents the whole line. Fields are comma separated and ranges
can be used. Example:

   tsv-join -f filter.tsv -k 1,2 --append-fields 3-7 data.tsv

Options:
EOS";

/** Container for command line options.
 */
struct TsvJoinOptions
{
    string programName;
    string filterFile;               // --filter
    size_t[] keyFields;              // --key-fields
    size_t[] dataFields;             // --data-fields
    size_t[] appendFields;           // --append-fields
    bool hasHeader = false;          // --H|header
    string appendHeaderPrefix = "";  // --append-header-prefix
    bool writeAll = false;           // --write-all
    string writeAllValue;            // --write-all
    bool exclude = false;            // --exclude
    char delim = '\t';               // --delimiter
    bool helpVerbose = false;        // --help-verbose
    bool versionWanted = false;      // --V|version
    bool allowDuplicateKeys = false; // --allow-duplicate-keys
    bool keyIsFullLine = false;      // Derived: --key-fields 0
    bool dataIsFullLine = false;     // Derived: --data-fields 0
    bool appendFullLine = false;     // Derived: --append-fields 0

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     * If the whole line is the key, the individual fields lists will be cleared.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsvutil : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        /* Handler for --write-all. Special handler so two values can be set. */
        void writeAllHandler(string option, string value)
        {
            debug stderr.writeln("[writeAllHandler] |", option, "| |", value, "|");
            writeAll = true;
            writeAllValue = value;
        }

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "              Print full help.", &helpVerbose,
                "f|filter-file",   "FILE          (Required) File with records to use as a filter.", &filterFile,

                "k|key-fields",    "<field-list>  Fields to use as join key. Default: 0 (entire line).",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "d|data-fields",   "<field-list>  Data record fields to use as join key, if different than --key-fields.",
                dataFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "a|append-fields", "<field-list>  Filter fields to append to matched records.",
                appendFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                std.getopt.config.caseSensitive,
                "H|header",        "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "p|prefix",        "STR           String to use as a prefix for --append-fields when writing a header line.", &appendHeaderPrefix,
                "w|write-all",     "STR           Output all data records. STR is the --append-fields value when writing unmatched records.", &writeAllHandler,
                "e|exclude",       "              Exclude matching records.", &exclude,
                "delimiter",       "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                "z|allow-duplicate-keys",
                                   "              Allow duplicate keys with different append values (last entry wins).", &allowDuplicateKeys,
                std.getopt.config.caseSensitive,
                "V|version",       "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-join"));
                return tuple(false, 0);
            }

            consistencyValidations(cmdArgs);
            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* This routine does validations not handled by getopt, usually because they
     * involve interactions between multiple parameters.
     */
    private void consistencyValidations(ref string[] processedCmdArgs)
    {
        import std.algorithm : any;

        if (filterFile.length == 0)
        {
            throw new Exception("Required option --filter-file was not supplied.");
        }
        else if (filterFile == "-" && processedCmdArgs.length == 1)
        {
            throw new Exception("A data file is required when standard input is used for the filter file (--f|filter-file -).");
        }

        if (writeAll && appendFields.length == 0)
        {
            throw new Exception("Use --a|append-fields when using --w|write-all.");
        }

        if (writeAll && appendFields.length == 1 && appendFields[0] == 0)
        {
            throw new Exception("Cannot use '--a|append-fields 0' (whole line) when using --w|write-all.");
        }

        if (appendFields.length > 0 && exclude)
        {
            throw new Exception("--e|exclude cannot be used with --a|append-fields.");
        }

        if (appendHeaderPrefix.length > 0 && !hasHeader)
        {
            throw new Exception("Use --header when using --p|prefix.");
        }

        if (dataFields.length > 0 && keyFields.length != dataFields.length)
        {
            throw new Exception("Different number of --k|key-fields and --d|data-fields.");
        }

        if (keyFields.length == 1 && dataFields.length == 1 &&
            ((keyFields[0] == 0 && dataFields[0] != 0) || (keyFields[0] != 0 && dataFields[0] == 0)))
        {
            throw new Exception("If either --k|key-field or --d|data-field is zero both must be zero.");
        }

        if ((keyFields.length > 1 && any!(a => a == 0)(keyFields)) ||
            (dataFields.length > 1 && any!(a => a == 0)(dataFields)) ||
            (appendFields.length > 1 && any!(a => a == 0)(appendFields)))
        {
            throw new Exception("Field 0 (whole line) cannot be combined with individual fields (non-zero).");
        }
    }

    /* Post-processing derivations. */
    void derivations()
    {
        import std.algorithm : each;
        import std.range;

        // Convert 'full-line' field indexes (index zero) to boolean flags.
        if (keyFields.length == 0)
        {
            assert(dataFields.length == 0);
            keyIsFullLine = true;
            dataIsFullLine = true;
        }
        else if (keyFields.length == 1 && keyFields[0] == 0)
        {
            keyIsFullLine = true;
            keyFields.popFront;
            dataIsFullLine = true;

            if (dataFields.length == 1)
            {
                assert(dataFields[0] == 0);
                dataFields.popFront;
            }
        }

        if (appendFields.length == 1 && appendFields[0] == 0)
        {
            appendFullLine = true;
            appendFields.popFront;
        }

        assert(!(keyIsFullLine && keyFields.length > 0));
        assert(!(dataIsFullLine && dataFields.length > 0));
        assert(!(appendFullLine && appendFields.length > 0));

        // Switch to zero-based field indexes.
        keyFields.each!((ref a) => --a);
        dataFields.each!((ref a) => --a);
        appendFields.each!((ref a) => --a);
    }
}

/**
Main program.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvJoinOptions cmdopt;
    auto r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];
    try tsvJoin(cmdopt, cmdArgs[1..$]);
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        return 1;
    }
    return 0;
}

/** tsvJoin does the primary work of the tsv-join program.
 */
void tsvJoin(in TsvJoinOptions cmdopt, in string[] inputFiles)
{
    import tsvutil : InputFieldReordering, BufferedOutputRange, throwIfWindowsNewlineOnUnix;
    import std.algorithm : splitter;
    import std.array : join;
    import std.range;
    import std.conv : to;

    /* State, variables, and convenience derivations.
     *
     * Combinations of individual fields and whole line (field zero) are convenient for the
     * user, but create complexities for the program. Many combinations are disallowed by
     * command line processing, but the remaining combos still leave several states. Also,
     * this code optimizes by doing only necessary operations, further complicating state.
     * Here's a guide to variables and state.
     * - cmdopt.keyFields, cmdopt.dataFields arrays - Individual field indexes used as keys.
     *      Empty if the whole line is used as a key. Must be the same length.
     * - cmdopt.keyIsFullLine, cmdopt.dataIsFullLine - True when the whole line is used as the key.
     * - cmdopt.appendFields array - Indexes of individual filter file fields being appended.
     *      Empty if appending the full line, or if not appending anything.
     * - cmdopt.appendFullLine - True when the whole line is being appended.
     * - isAppending - True if something is being appended.
     * - cmdopt.writeAll - True if all lines are being written
     */
    /* Convenience derivations. */
    auto numKeyFields = cmdopt.keyFields.length;
    auto numAppendFields = cmdopt.appendFields.length;
    bool isAppending = (cmdopt.appendFullLine || numAppendFields > 0);

    /* Mappings from field indexes in the input lines to collection arrays. */
    auto filterKeysReordering = new InputFieldReordering!char(cmdopt.keyFields);
    auto dataKeysReordering = (cmdopt.dataFields.length == 0) ?
        filterKeysReordering : new InputFieldReordering!char(cmdopt.dataFields);
    auto appendFieldsReordering = new InputFieldReordering!char(cmdopt.appendFields);

    /* The master filter hash. The key is the delimited fields concatenated together
     * (including separators). The value is the appendFields concatenated together, as
     * they will be appended to the input line. Both the keys and append fields are
     * assembled in the order specified, though this is only required for append fields.
     */
    string[string] filterHash;
    string appendFieldsHeader;

    /* The append values for unmatched records. */
    char[] appendFieldsUnmatchedValue;

    if (cmdopt.writeAll)
    {
        assert(cmdopt.appendFields.length > 0);  // Checked in consistencyValidations

        // reserve space for n values and n-1 delimiters
        appendFieldsUnmatchedValue.reserve(cmdopt.appendFields.length * (cmdopt.writeAllValue.length + 1) - 1);

        appendFieldsUnmatchedValue ~= cmdopt.writeAllValue;
        for (size_t i = 1; i < cmdopt.appendFields.length; ++i)
        {
            appendFieldsUnmatchedValue ~= cmdopt.delim;
            appendFieldsUnmatchedValue ~= cmdopt.writeAllValue;
        }
    }

    /* Read the filter file. */
    {
        bool needPerFieldProcessing = (numKeyFields > 0) || (numAppendFields > 0);
        auto filterStream = (cmdopt.filterFile == "-") ? stdin : cmdopt.filterFile.File;
        foreach (lineNum, line; filterStream.byLine.enumerate(1))
        {
            debug writeln("[filter line] |", line, "|");
            if (needPerFieldProcessing)
            {
                filterKeysReordering.initNewLine;
                appendFieldsReordering.initNewLine;

                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    filterKeysReordering.processNextField(fieldIndex,fieldValue);
                    appendFieldsReordering.processNextField(fieldIndex,fieldValue);

                    if (filterKeysReordering.allFieldsFilled && appendFieldsReordering.allFieldsFilled)
                    {
                        break;
                    }
                }
                // Processed all fields in the line.
                if (!filterKeysReordering.allFieldsFilled || !appendFieldsReordering.allFieldsFilled)
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               (cmdopt.filterFile == "-") ? "Standard Input" : cmdopt.filterFile, lineNum));
                }
            }

            string key = cmdopt.keyIsFullLine ?
                line.to!string : filterKeysReordering.outputFields.join(cmdopt.delim).to!string;
            string appendValues = cmdopt.appendFullLine ?
                line.to!string : appendFieldsReordering.outputFields.join(cmdopt.delim).to!string;

            debug writeln("  --> [key]:[append] => [", key, "]:[", appendValues, "]");

            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, cmdopt.filterFile, lineNum);

            if (lineNum == 1 && cmdopt.hasHeader)
            {
                if (cmdopt.appendHeaderPrefix.length == 0)
                {
                    appendFieldsHeader = appendValues;
                }
                else
                {
                    foreach (fieldIndex, fieldValue; appendValues.splitter(cmdopt.delim).enumerate)
                    {
                        if (fieldIndex > 0) appendFieldsHeader ~= cmdopt.delim;
                        appendFieldsHeader ~= cmdopt.appendHeaderPrefix;
                        appendFieldsHeader ~= fieldValue;
                    }
                }
            }
            else
            {
                if (isAppending && !cmdopt.allowDuplicateKeys)
                {
                    string* currAppendValues = (key in filterHash);
                    if (currAppendValues !is null && *currAppendValues != appendValues)
                    {
                        throw new Exception(
                            format("Duplicate keys with different append values (use --z|allow-duplicate-keys to ignore)\n   [key 1][values]: [%s][%s]\n   [key 2][values]: [%s][%s]",
                                   key, *currAppendValues, key, appendValues));
                    }
                }
                filterHash[key] = appendValues;
            }
        }
    }

    filterHash.rehash;  // For faster lookups. (Per docs. In my tests no performance delta.)

    /* Now process each input file, one line at a time. */

    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
    bool headerWritten = false;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            debug writeln("[input line] |", line, "|");

            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            if (lineNum == 1 && cmdopt.hasHeader)
            {
                /* Header line processing. */
                if (!headerWritten)
                {
                    bufferedOutput.append(line);
                    if (isAppending)
                    {
                        bufferedOutput.append(cmdopt.delim);
                        bufferedOutput.append(appendFieldsHeader);
                    }
                    bufferedOutput.appendln;
                    headerWritten = true;
                }
            }
            else
            {
                /* Regular line (not a header line).
                 *
                 * Next block checks if the input line matches a hash entry. Two cases:
                 *   a) The whole line is the key. Simply look it up in the hash.
                 *   b) Individual fields are used as the key - Assemble key and look it up.
                 *
                 * At the end of the appendFields will contain the result of hash lookup.
                 */
                string* appendFields;
                if (cmdopt.keyIsFullLine)
                {
                    appendFields = (line in filterHash);
                }
                else
                {
                    dataKeysReordering.initNewLine;
                    foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                    {
                        dataKeysReordering.processNextField(fieldIndex, fieldValue);
                        if (dataKeysReordering.allFieldsFilled) break;
                    }
                    // Processed all fields in the line.
                    if (!dataKeysReordering.allFieldsFilled)
                    {
                        throw new Exception(
                            format("Not enough fields in line. File: %s, Line: %s",
                                   (filename == "-") ? "Standard Input" : filename, lineNum));
                    }
                    appendFields = (dataKeysReordering.outputFields.join(cmdopt.delim) in filterHash);
                }

                bool matched = (appendFields !is null);
                debug writeln("   --> matched? ", matched);
                if (cmdopt.writeAll || (matched && !cmdopt.exclude) || (!matched && cmdopt.exclude))
                {
                    bufferedOutput.append(line);
                    if (isAppending)
                    {
                        bufferedOutput.append(cmdopt.delim);
                        bufferedOutput.append(matched ? *appendFields : appendFieldsUnmatchedValue);
                    }
                    bufferedOutput.appendln();
                }
            }
        }
    }
}