tsv_join source code

1 /**
2 Command line tool that joins tab-separated value files based on a common key.
3 
4 This tool joins lines from tab-delimited files based on a common key. One file, the 'filter'
5 file, contains the records (lines) being matched. The other input files are searched for
6 matching records. Matching records are written to standard output, along with any designated
7 fields from the 'filter' file. In database parlance this is a 'hash semi-join'.
8 
9 Copyright (c) 2015-2018, eBay Software Foundation
10 Initially written by Jon Degenhardt
11 
12 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
13 */
14 module tsv_join;
15 
16 import std.stdio;
17 import std.format : format;
18 import std.typecons : tuple;
19 
20 auto helpText = q"EOS
21 Synopsis: tsv-join --filter-file file [options] file [file...]
22 
23 tsv-join matches input lines against lines from a 'filter' file. The match is
24 based on fields or the entire line. Use '--help-verbose' for more details.
25 
26 Options:
27 EOS";
28 
29 auto helpTextVerbose = q"EOS
30 Synopsis: tsv-join --filter-file file [options] file [file...]
31 
32 tsv-join matches input lines against lines from a 'filter' file. The match is
33 based on exact match comparison of one or more 'key' fields. Fields are TAB
34 delimited by default. Matching lines are written to standard output, along with
35 any additional fields from the key file that have been specified. An example:
36 
37   tsv-join --filter-file filter.tsv --key-fields 1 --append-fields 5,6 data.tsv
38 
39 This reads filter.tsv, creating a hash table keyed on field 1. Lines from data.tsv
40 are read one at a time. If field 1 is found in the hash table, the line is written
41 to standard output with fields 5 and 6 from the filter file appended. In database
42 parlance this is a "hash semi join". Note the asymmetric relationship: Records in
43 the filter file should be unique, but data.tsv lines can repeat.
44 
45 tsv-join can also work as a simple filter, this is the default behavior. Example:
46 
47   tsv-join --filter-file filter.tsv data.tsv
48 
49 This outputs all lines from data.tsv found in filter.tsv. --key-fields can still
50 be used to define the match key. The --exclude option can be used to exclude
51 matched lines rather than keep them.
52 
53 Multiple fields can be specified as keys and append fields. Field numbers start
54 at one, zero represents the whole line. Fields are comma separated and ranges
55 can be used. Example:
56 
57   tsv-join -f filter.tsv -k 1,2 --append-fields 3-7 data.tsv
58 
59 Options:
60 EOS";
61 
62 /** Container for command line options.
63  */
64 struct TsvJoinOptions
65 {
66     string programName;
67     string filterFile;               // --filter
68     size_t[] keyFields;              // --key-fields
69     size_t[] dataFields;             // --data-fields
70     size_t[] appendFields;           // --append-fields
71     bool hasHeader = false;          // --H|header
72     string appendHeaderPrefix = "";  // --append-header-prefix
73     bool writeAll = false;           // --write-all
74     string writeAllValue;            // --write-all
75     bool exclude = false;            // --exclude
76     char delim = '\t';               // --delimiter
77     bool helpVerbose = false;        // --help-verbose
78     bool versionWanted = false;      // --V|version
79     bool allowDupliateKeys = false;  // --allow-duplicate-keys
80     bool keyIsFullLine = false;      // Derived: --key-fields 0
81     bool dataIsFullLine = false;     // Derived: --data-fields 0
82     bool appendFullLine = false;     // Derived: --append-fields 0
83 
84     /* Returns a tuple. First value is true if command line arguments were successfully
85      * processed and execution should continue, or false if an error occurred or the user
86      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
87      *
88      * Returning true (execution continues) means args have been validated and derived
89      * values calculated. In addition, field indices have been converted to zero-based.
90      * If the whole line is the key, the individual fields lists will be cleared.
91      */
92     auto processArgs (ref string[] cmdArgs)
93     {
94         import std.algorithm : any, each;
95         import std.getopt;
96         import std.path : baseName, stripExtension;
97         import std.typecons : Yes, No;
98         import tsvutil :  makeFieldListOptionHandler;
99 
100         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
101 
102         /* Handler for --write-all. Special handler so two values can be set. */
103         void writeAllHandler(string option, string value)
104         {
105             debug stderr.writeln("[writeAllHandler] |", option, "|  |", value, "|");
106             writeAll = true;
107             writeAllValue = value;
108         }
109 
110         try
111         {
112             arraySep = ",";    // Use comma to separate values in command line options
113             auto r = getopt(
114                 cmdArgs,
115                 "help-verbose",    "              Print full help.", &helpVerbose,
116                 "f|filter-file",   "FILE          (Required) File with records to use as a filter.", &filterFile,
117 
118                 "k|key-fields",    "<field-list>  Fields to use as join key. Default: 0 (entire line).",
119                 keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
120 
121                 "d|data-fields",   "<field-list>  Data record fields to use as join key, if different than --key-fields.",
122                 dataFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
123 
124                 "a|append-fields", "<field-list>  Filter fields to append to matched records.",
125                 appendFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
126 
127                 std.getopt.config.caseSensitive,
128                 "H|header",        "              Treat the first line of each file as a header.", &hasHeader,
129                 std.getopt.config.caseInsensitive,
130                 "p|prefix",        "STR           String to use as a prefix for --append-fields when writing a header line.", &appendHeaderPrefix,
131                 "w|write-all",     "STR           Output all data records. STR is the --append-fields value when writing unmatched records.", &writeAllHandler,
132                 "e|exclude",       "              Exclude matching records.", &exclude,
133                 "delimiter",       "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
134                 "z|allow-duplicate-keys",
135                                    "              Allow duplicate keys with different append values (last entry wins).", &allowDupliateKeys,
136                 std.getopt.config.caseSensitive,
137                 "V|version",       "              Print version information and exit.", &versionWanted,
138                 std.getopt.config.caseInsensitive,
139                 );
140 
141             if (r.helpWanted)
142             {
143                 defaultGetoptPrinter(helpText, r.options);
144                 return tuple(false, 0);
145             }
146             else if (helpVerbose)
147             {
148                 defaultGetoptPrinter(helpTextVerbose, r.options);
149                 return tuple(false, 0);
150             }
151             else if (versionWanted)
152             {
153                 import tsvutils_version;
154                 writeln(tsvutilsVersionNotice("tsv-join"));
155                 return tuple(false, 0);
156             }
157 
158             consistencyValidations(cmdArgs);
159             derivations();
160         }
161         catch (Exception exc)
162         {
163             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
164             return tuple(false, 1);
165         }
166         return tuple(true, 0);
167     }
168 
169     /* This routine does validations not handled by getopt, usually because they
170      * involve interactions between multiple parameters.
171      */
172     private void consistencyValidations(ref string[] processedCmdArgs)
173     {
174         import std.algorithm : any;
175 
176         if (filterFile.length == 0)
177         {
178             throw new Exception("Required option --filter-file was not supplied.");
179         }
180         else if (filterFile == "-" && processedCmdArgs.length == 1)
181         {
182             throw new Exception("A data file is required when standard input is used for the filter file (--f|filter-file -).");
183         }
184 
185         if (writeAll && appendFields.length == 0)
186         {
187             throw new Exception("Use --a|append-fields when using --w|write-all.");
188         }
189 
190         if (writeAll && appendFields.length == 1 && appendFields[0] == 0)
191         {
192             throw new Exception("Cannot use '--a|append-fields 0' (whole line) when using --w|write-all.");
193         }
194 
195         if (appendFields.length > 0 && exclude)
196         {
197             throw new Exception("--e|exclude cannot be used with --a|append-fields.");
198         }
199 
200         if (appendHeaderPrefix.length > 0 && !hasHeader)
201         {
202             throw new Exception("Use --header when using --p|prefix.");
203         }
204 
205         if (dataFields.length > 0 && keyFields.length != dataFields.length)
206         {
207             throw new Exception("Different number of --k|key-fields and --d|data-fields.");
208         }
209 
210         if (keyFields.length == 1 && dataFields.length == 1 &&
211             ((keyFields[0] == 0 && dataFields[0] != 0) || (keyFields[0] != 0 && dataFields[0] == 0)))
212         {
213             throw new Exception("If either --k|key-field or --d|data-field is zero both must be zero.");
214         }
215 
216         if ((keyFields.length > 1    && any!(a => a == 0)(keyFields)) ||
217             (dataFields.length > 1   && any!(a => a == 0)(dataFields)) ||
218             (appendFields.length > 1 && any!(a => a == 0)(appendFields)))
219         {
220             throw new Exception("Field 0 (whole line) cannot be combined with individual fields (non-zero).");
221         }
222 
223     }
224 
225     /* Post-processing derivations. */
226     void derivations()
227     {
228         import std.algorithm : each;
229         import std.range;
230 
231         // Convert 'full-line' field indexes (index zero) to boolean flags.
232         if (keyFields.length == 0)
233         {
234             assert(dataFields.length == 0);
235             keyIsFullLine = true;
236             dataIsFullLine = true;
237         }
238         else if (keyFields.length == 1 && keyFields[0] == 0)
239         {
240             keyIsFullLine = true;
241             keyFields.popFront;
242             dataIsFullLine = true;
243 
244             if (dataFields.length == 1)
245             {
246                 assert(dataFields[0] == 0);
247                 dataFields.popFront;
248             }
249         }
250 
251         if (appendFields.length == 1 && appendFields[0] == 0)
252         {
253             appendFullLine = true;
254             appendFields.popFront;
255         }
256 
257         assert(!(keyIsFullLine && keyFields.length > 0));
258         assert(!(dataIsFullLine && dataFields.length > 0));
259         assert(!(appendFullLine && appendFields.length > 0));
260 
261         // Switch to zero-based field indexes.
262         keyFields.each!((ref a) => --a);
263         dataFields.each!((ref a) => --a);
264         appendFields.each!((ref a) => --a);
265     }
266 }
267 
268 /**
269 Main program.
270  */
271 int main(string[] cmdArgs)
272 {
273     /* When running in DMD code coverage mode, turn on report merging. */
274     version(D_Coverage) version(DigitalMars)
275     {
276         import core.runtime : dmd_coverSetMerge;
277         dmd_coverSetMerge(true);
278     }
279 
280     TsvJoinOptions cmdopt;
281     auto r = cmdopt.processArgs(cmdArgs);
282     if (!r[0]) return r[1];
283     try tsvJoin(cmdopt, cmdArgs[1..$]);
284     catch (Exception exc)
285     {
286         stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
287         return 1;
288     }
289     return 0;
290 }
291 
292 /** tsvJoin does the primary work of the tsv-join program.
293  */
294 void tsvJoin(in TsvJoinOptions cmdopt, in string[] inputFiles)
295 {
296     import tsvutil : InputFieldReordering, BufferedOutputRange, throwIfWindowsNewlineOnUnix;
297     import std.algorithm : splitter;
298     import std.array : join;
299     import std.range;
300     import std.conv : to;
301 
302     /* State, variables, and convenience derivations.
303      *
304      * Combinations of individual fields and whole line (field zero) are convenient for the
305      * user, but create complexities for the program. Many combinations are disallowed by
306      * command line processing, but the remaining combos still leave several states. Also,
307      * this code optimizes by doing only necessary operations, further complicating state
308      * Here's a guide to variables and state.
309      * - cmdopt.keyFields, cmdopt.dataFields arrays - Individual field indexes used as keys.
310      *      Empty if the  whole line is used as a key. Must be the same length.
311      * - cmdopt.keyIsFullLine, cmdopt.dataIsFullLine - True when the whole line is used key.
312      * - cmdopt.appendFields array - Indexes of individual filter file fields being appended.
313      *      Empty if appending the full line, or if not appending anything.
314      * - cmdopt.appendFullLine - True when the whole line is being appended.
315      * - isAppending - True is something is being appended.
316      * - cmdopt.writeAll - True if all lines are being written
317      */
318     /* Convenience derivations. */
319     auto numKeyFields = cmdopt.keyFields.length;
320     auto numAppendFields = cmdopt.appendFields.length;
321     bool isAppending = (cmdopt.appendFullLine || numAppendFields > 0);
322 
323     /* Mappings from field indexes in the input lines to collection arrays. */
324     auto filterKeysReordering = new InputFieldReordering!char(cmdopt.keyFields);
325     auto dataKeysReordering = (cmdopt.dataFields.length == 0) ?
326         filterKeysReordering : new InputFieldReordering!char(cmdopt.dataFields);
327     auto appendFieldsReordering = new InputFieldReordering!char(cmdopt.appendFields);
328 
329     /* The master filter hash. The key is the delimited fields concatenated together
330      * (including separators). The value is the appendFields concatenated together, as
331      * they will be appended to the input line. Both the keys and append fields are
332      * assembled in the order specified, though this only required for append fields.
333      */
334     string[string] filterHash;
335     string appendFieldsHeader;
336 
337     /* The append values for unmatched records. */
338     char[] appendFieldsUnmatchedValue;
339 
340     if (cmdopt.writeAll)
341     {
342         assert(cmdopt.appendFields.length > 0);  // Checked in consistencyValidations
343 
344         // reserve space for n values and n-1 delimiters
345         appendFieldsUnmatchedValue.reserve(cmdopt.appendFields.length * (cmdopt.writeAllValue.length + 1) - 1);
346 
347         appendFieldsUnmatchedValue ~= cmdopt.writeAllValue;
348         for (size_t i = 1; i < cmdopt.appendFields.length; ++i)
349         {
350             appendFieldsUnmatchedValue ~= cmdopt.delim;
351             appendFieldsUnmatchedValue ~= cmdopt.writeAllValue;
352         }
353     }
354 
355     /* Read the filter file. */
356     {
357         bool needPerFieldProcessing = (numKeyFields > 0) || (numAppendFields > 0);
358         auto filterStream = (cmdopt.filterFile == "-") ? stdin : cmdopt.filterFile.File;
359         foreach (lineNum, line; filterStream.byLine.enumerate(1))
360         {
361             debug writeln("[filter line] |", line, "|");
362             if (needPerFieldProcessing)
363             {
364                 filterKeysReordering.initNewLine;
365                 appendFieldsReordering.initNewLine;
366 
367                 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
368                 {
369                     filterKeysReordering.processNextField(fieldIndex,fieldValue);
370                     appendFieldsReordering.processNextField(fieldIndex,fieldValue);
371 
372                     if (filterKeysReordering.allFieldsFilled && appendFieldsReordering.allFieldsFilled)
373                     {
374                         break;
375                     }
376                 }
377                 // Processed all fields in the line.
378                 if (!filterKeysReordering.allFieldsFilled || !appendFieldsReordering.allFieldsFilled)
379                 {
380                     throw new Exception(
381                         format("Not enough fields in line. File: %s, Line: %s",
382                                (cmdopt.filterFile == "-") ? "Standard Input" : cmdopt.filterFile, lineNum));
383                 }
384             }
385 
386             string key = cmdopt.keyIsFullLine ?
387                 line.to!string : filterKeysReordering.outputFields.join(cmdopt.delim).to!string;
388             string appendValues = cmdopt.appendFullLine ?
389                 line.to!string : appendFieldsReordering.outputFields.join(cmdopt.delim).to!string;
390 
391             debug writeln("  --> [key]:[append] => [", key, "]:[", appendValues, "]");
392 
393             if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, cmdopt.filterFile, lineNum);
394 
395             if (lineNum == 1 && cmdopt.hasHeader)
396             {
397                 if (cmdopt.appendHeaderPrefix.length == 0)
398                 {
399                     appendFieldsHeader = appendValues;
400                 }
401                 else
402                 {
403                     foreach (fieldIndex, fieldValue; appendValues.splitter(cmdopt.delim).enumerate)
404                     {
405                         if (fieldIndex > 0) appendFieldsHeader ~= cmdopt.delim;
406                         appendFieldsHeader ~= cmdopt.appendHeaderPrefix;
407                         appendFieldsHeader ~= fieldValue;
408                     }
409                 }
410             }
411             else
412             {
413                 if (isAppending && !cmdopt.allowDupliateKeys)
414                 {
415                     string* currAppendValues = (key in filterHash);
416                     if (currAppendValues !is null && *currAppendValues != appendValues)
417                     {
418                         throw new Exception(
419                             format("Duplicate keys with different append values (use --z|allow-duplicate-keys to ignore)\n   [key 1][values]: [%s][%s]\n   [key 2][values]: [%s][%s]",
420                                    key, *currAppendValues, key, appendValues));
421                     }
422                 }
423                 filterHash[key] = appendValues;
424             }
425         }
426     }
427 
428     filterHash.rehash;    // For faster lookups. (Per docs. In my tests no performance delta.)
429 
430     /* Now process each input file, one line at a time. */
431 
432     auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
433     bool headerWritten = false;
434 
435     foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
436     {
437         auto inputStream = (filename == "-") ? stdin : filename.File();
438         foreach (lineNum, line; inputStream.byLine.enumerate(1))
439         {
440             debug writeln("[input line] |", line, "|");
441 
442             if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);
443 
444             if (lineNum == 1 && cmdopt.hasHeader)
445             {
446                 /* Header line processing. */
447                 if (!headerWritten)
448                 {
449                     bufferedOutput.append(line);
450                     if (isAppending)
451                     {
452                         bufferedOutput.append(cmdopt.delim);
453                         bufferedOutput.append(appendFieldsHeader);
454                     }
455                     bufferedOutput.appendln;
456                     headerWritten = true;
457                 }
458             }
459             else
460             {
461                 /* Regular line (not a header line).
462                  *
463                  * Next block checks if the input line matches a hash entry. Two cases:
464                  *   a) The whole line is the key. Simply look it up in the hash.
465                  *   b) Individual fields are used as the key - Assemble key and look it up.
466                  *
467                  * At the end of the appendFields will contain the result of hash lookup.
468                  */
469                 string* appendFields;
470                 if (cmdopt.keyIsFullLine)
471                 {
472                     appendFields = (line in filterHash);
473                 }
474                 else
475                 {
476                     dataKeysReordering.initNewLine;
477                     foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
478                     {
479                         dataKeysReordering.processNextField(fieldIndex, fieldValue);
480                         if (dataKeysReordering.allFieldsFilled) break;
481                     }
482                     // Processed all fields in the line.
483                     if (!dataKeysReordering.allFieldsFilled)
484                     {
485                         throw new Exception(
486                             format("Not enough fields in line. File: %s, Line: %s",
487                                    (filename == "-") ? "Standard Input" : filename, lineNum));
488                     }
489                     appendFields = (dataKeysReordering.outputFields.join(cmdopt.delim) in filterHash);
490                 }
491 
492                 bool matched = (appendFields !is null);
493                 debug writeln("   --> matched? ", matched);
494                 if (cmdopt.writeAll || (matched && !cmdopt.exclude) || (!matched && cmdopt.exclude))
495                 {
496                     bufferedOutput.append(line);
497                     if (isAppending)
498                     {
499                         bufferedOutput.append(cmdopt.delim);
500                         bufferedOutput.append(matched ? *appendFields : appendFieldsUnmatchedValue);
501                     }
502                     bufferedOutput.appendln();
503                 }
504             }
505         }
506     }
507 }