tsv_uniq source code

1 /**
2 Command line tool using fields in a tab-separated value file to identify equivalent
3 lines. Can either remove the duplicate entries or mark as equivalence classes.
4 
5 This tool reads a tab-separated value file line by line, using one or more fields to
6 record a key. If the same key is found in a subsequent line, it is identified as
7 equivalent. When operating in 'uniq' mode, the first time a key is seen the line is
8 written to standard output, but subsequent matching lines are discarded.
9 
10 The alternate to 'uniq' is 'equiv-class' identification. In this mode, all lines
11 written to standard output, but a new field is added marking equivalent entries with
12 with an ID. The ID is simply a one-upped counter.
13 
14 Copyright (c) 2015-2018, eBay Software Foundation
15 Initially written by Jon Degenhardt
16 
17 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
18 */
19 module tsv_uniq;
20 
21 import std.stdio;
22 import std.format : format;
23 import std.typecons : tuple;
24 
25 auto helpText = q"EOS
26 Synopsis: tsv-uniq [options] [file...]
27 
28 tsv-uniq filters out duplicate lines using fields as a key. Filtering is based
29 on the entire line if a key is not provided.
30 
31 Options:
32 EOS";
33 
34 auto helpTextVerbose = q"EOS
35 Synopsis: tsv-uniq [options] [file...]
36 
37 tsv-uniq identifies equivalent lines in tab-separated value files. Input is read
38 line by line, recording a key based on one or more of the fields. Two lines are
39 equivalent if they have the same key. When operating in 'uniq' mode, the first
40 time a key is seen the line is written to standard output, but subsequent lines
41 are discarded. This is similar to the unix 'uniq' program, but based on individual
42 fields and without requiring sorted data. This command uniq's on fields 2 and 3:
43 
44    tsv-uniq -f 2,3 file.tsv
45 
46 The alternate to 'uniq' mode is 'equiv-class' identification. In this mode, all
47 lines are written to standard output, but with a new field added marking
48 equivalent entries with an ID. The ID is simply a one-upped counter. Example:
49 
50    tsv-uniq --header -f 2,3 --equiv file.tsv
51 
52 tsv-uniq can be run without specifying a key field. In this case the whole line
53 is used as a key, same as the Unix 'uniq' program. This works on any line-oriented
54 text file, not just TSV files.
55 
56 The '--r|repeated' option can be used to print only lines occurring more than
57 once. '--a|at-least N' is similar, except that it only prints lines occuring at
58 least N times. For both, the Nth line found is printed, in the order found.
59 
60 The '--m|max MAX' option changes the behavior to output the first MAX lines for
61 each key, rather than just the first line for each key. This can also with used
62 with '--e|equiv' to limit the number output for each equivalence class.
63 
64 It's not obvious when both '--a|at-least' and '--m|max' might be useful, but, if
65 both are specified, the occurrences between 'at-least' and 'max' are output.
66 
67 Options:
68 EOS";
69 
70 /**
71 Container for command line options.
72  */
73 struct TsvUniqOptions
74 {
75     enum defaultEquivHeader = "equiv_id";
76     enum defaultEquivStartID = 1;
77 
78     string programName;
79     bool helpVerbose = false;                 // --h|help-verbose
80     bool versionWanted = false;               // --V|version
81     size_t[] fields;                          // --f|fields
82     bool hasHeader = false;                   // --H|header
83     bool onlyRepeated = false;                // --r|repeated. Shorthand for '--atleast 2'
84     size_t atLeast = 0;                       // --a|at-least. Zero implies default behavior.
85     size_t max = 0;                           // --m|max. Zero implies default behavior.
86     bool equivMode = false;                   // --e|equiv
87     string equivHeader = defaultEquivHeader;  // --equiv-header
88     long equivStartID = defaultEquivStartID;  // --equiv-start
89     bool ignoreCase = false;                  // --i|ignore-case
90     char delim = '\t';                        // --d|delimiter
91     bool keyIsFullLine = false;               // Derived. True if no fields specified or '--f|fields 0'
92 
93     /* Returns a tuple. First value is true if command line arguments were successfully
94      * processed and execution should continue, or false if an error occurred or the user
95      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
96      *
97      * Returning true (execution continues) means args have been validated and derived
98      * values calculated. In addition, field indices have been converted to zero-based.
99      * If the whole line is the key, the individual fields list will be cleared.
100      *
101      * Repeat count control variables 'atLeast' and max' - The general idea is that the
102      * values are left at zero if no repeat count options are specified. They are set if
103      * repeat count options are specified, as follows:
104      *   * atLeast - Will be zero unless --r|repeated or --a|at-least is specified.
105      *     --r|repeated option sets it 2, --a|at-least sets it to the specified value.
106      *   * max - Default to zero. Is set to the --m|max value if provided. Is set to
107      *    'atLeast' if --r|repeated or --a|at-least is provided.
108      *
109      * An exception to the above: If --e|equiv-mode is specified, then (max == 0)
110      * represents the default "output all values" case. In this case max may be less
111      * than the at-least value.
112      */
113     auto processArgs (ref string[] cmdArgs)
114     {
115         import std.algorithm : any, each;
116         import std.getopt;
117         import std.path : baseName, stripExtension;
118         import std.typecons : Yes, No;
119         import tsvutil :  makeFieldListOptionHandler;
120 
121         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
122 
123         try
124         {
125             arraySep = ",";    // Use comma to separate values in command line options
126             auto r = getopt(
127                 cmdArgs,
128                 "help-verbose",  "              Print full help.", &helpVerbose,
129                 std.getopt.config.caseSensitive,
130                 "V|version",     "              Print version information and exit.", &versionWanted,
131                 "H|header",      "              Treat the first line of each file as a header.", &hasHeader,
132                 std.getopt.config.caseInsensitive,
133 
134                 "f|fields",      "<field-list>  Fields to use as the key. Default: 0 (entire line).",
135                 fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex,Yes.allowFieldNumZero),
136 
137                 "i|ignore-case", "              Ignore case when comparing keys.", &ignoreCase,
138                 "r|repeated",    "              Output only lines that are repeated (based on the key).", &onlyRepeated,
139                 "a|at-least",    "INT           Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast,
140                 "m|max",         "INT           Max number of each unique key to output (zero is ignored).", &max,
141                 "e|equiv",       "              Output equiv class IDs rather than uniq'ing entries.", &equivMode,
142                 "equiv-header",  "STR           Use STR as the equiv-id field header. Applies when using '--header --equiv'. Default: 'equiv_id'.", &equivHeader,
143                 "equiv-start",   "INT           Use INT as the first equiv-id. Default: 1.", &equivStartID,
144                 "d|delimiter",   "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
145                 );
146 
147             if (r.helpWanted)
148             {
149                 defaultGetoptPrinter(helpText, r.options);
150                 return tuple(false, 0);
151             }
152             else if (helpVerbose)
153             {
154                 defaultGetoptPrinter(helpTextVerbose, r.options);
155                 return tuple(false, 0);
156             }
157             else if (versionWanted)
158             {
159                 import tsvutils_version;
160                 writeln(tsvutilsVersionNotice("tsv-uniq"));
161                 return tuple(false, 0);
162             }
163 
164             /* Consistency checks */
165             if (!equivMode)
166             {
167                 if (equivHeader != defaultEquivHeader)
168                 {
169                     throw new Exception("--equiv-header requires --e|equiv");
170                 }
171                 else if (equivStartID != defaultEquivStartID)
172                 {
173                     throw new Exception("--equiv-start requires --e|equiv");
174                 }
175             }
176 
177             if (fields.length > 1 && fields.any!(x => x == 0))
178             {
179                 throw new Exception("Whole line as key (--f|field 0) cannot be combined with multiple fields.");
180             }
181 
182             /* Derivations */
183             if (fields.length == 0)
184             {
185                 keyIsFullLine = true;
186             }
187             else if (fields.length == 1 && fields[0] == 0)
188             {
189                 keyIsFullLine = true;
190                 fields.length = 0;
191             }
192 
193             if (onlyRepeated && atLeast <= 1) atLeast = 2;
194             if (atLeast >= 2 && max < atLeast)
195             {
196                 // Don't modify max if it is zero and equivMode is in effect.
197                 if (max != 0 || !equivMode) max = atLeast;
198             }
199 
200             fields.each!((ref x) => --x);  // Convert to 1-based indexing.
201 
202         }
203         catch (Exception exc)
204         {
205             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
206             return tuple(false, 1);
207         }
208         return tuple(true, 0);
209     }
210 }
211 
212 int main(string[] cmdArgs)
213 {
214     /* When running in DMD code coverage mode, turn on report merging. */
215     version(D_Coverage) version(DigitalMars)
216     {
217         import core.runtime : dmd_coverSetMerge;
218         dmd_coverSetMerge(true);
219     }
220 
221     TsvUniqOptions cmdopt;
222     auto r = cmdopt.processArgs(cmdArgs);
223     if (!r[0]) return r[1];
224     try tsvUniq(cmdopt, cmdArgs[1..$]);
225     catch (Exception exc)
226     {
227         stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
228         return 1;
229     }
230     return 0;
231 }
232 
233 void tsvUniq(in TsvUniqOptions cmdopt, in string[] inputFiles)
234 {
235     import tsvutil : InputFieldReordering, BufferedOutputRange;
236     import std.algorithm : splitter;
237     import std.array : join;
238     import std.conv : to;
239     import std.range;
240     import std.uni : toLower;
241 
242     /* InputFieldReordering maps the key fields from an input line to a separate buffer. */
243     auto keyFieldsReordering = new InputFieldReordering!char(cmdopt.fields);
244 
245     /* BufferedOutputRange is a performance enhancement for writing to stdout. */
246     auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
247 
248     /* The master hash. The key is the specified fields concatenated together (including
249      * separators). The value is a struct with the equiv-id and occurrence count.
250      */
251     struct EquivEntry { size_t equivID; size_t count; }
252     EquivEntry[string] equivHash;
253 
254     size_t numFields = cmdopt.fields.length;
255     long nextEquivID = cmdopt.equivStartID;
256     bool headerWritten = false;
257     foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
258     {
259         auto inputStream = (filename == "-") ? stdin : filename.File();
260         foreach (lineNum, line; inputStream.byLine.enumerate(1))
261         {
262             if (cmdopt.hasHeader && lineNum == 1)
263             {
264                 /* Header line. */
265                 if (!headerWritten)
266                 {
267                     bufferedOutput.append(line);
268                     if (cmdopt.equivMode)
269                     {
270                         bufferedOutput.append(cmdopt.delim);
271                         bufferedOutput.append(cmdopt.equivHeader);
272                     }
273                     bufferedOutput.appendln();
274                     headerWritten = true;
275                 }
276             }
277             else
278             {
279                 /* Regular line (not header). Start by finding the key. */
280                 typeof(line) key;
281                 if (cmdopt.keyIsFullLine)
282                 {
283                     key = line;
284                 }
285                 else
286                 {
287                     /* Copy the key fields to a new buffer. */
288                     keyFieldsReordering.initNewLine;
289                     foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
290                     {
291                         keyFieldsReordering.processNextField(fieldIndex, fieldValue);
292                         if (keyFieldsReordering.allFieldsFilled) break;
293                     }
294 
295                     if (!keyFieldsReordering.allFieldsFilled)
296                     {
297                         throw new Exception(
298                             format("Not enough fields in line. File: %s, Line: %s",
299                                    (filename == "-") ? "Standard Input" : filename, lineNum));
300                     }
301 
302                     key = keyFieldsReordering.outputFields.join(cmdopt.delim);
303                 }
304 
305                 if (cmdopt.ignoreCase) key = key.toLower;
306 
307                 bool isOutput = false;
308                 EquivEntry currEntry;
309                 EquivEntry* priorEntry = (key in equivHash);
310                 if (priorEntry is null)
311                 {
312                     isOutput = (cmdopt.atLeast <= 1);
313                     currEntry.equivID = nextEquivID;
314                     currEntry.count = 1;
315                     equivHash[key.to!string] = currEntry;
316                     nextEquivID++;
317                 }
318                 else
319                 {
320                     (*priorEntry).count++;
321                     currEntry = *priorEntry;
322 
323                     if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) ||
324                         (cmdopt.equivMode && cmdopt.max == 0))
325                     {
326                         isOutput = true;
327                     }
328                 }
329 
330                 if (isOutput)
331                 {
332                     bufferedOutput.append(line);
333                     if (cmdopt.equivMode)
334                     {
335                         bufferedOutput.append(cmdopt.delim);
336                         bufferedOutput.append(currEntry.equivID.to!string);
337                     }
338                     bufferedOutput.appendln();
339                 }
340             }
341         }
342     }
343 }