1 /**
2 Command line tool using fields in a tab-separated value file to identify equivalent
3 lines. Can either remove the duplicate entries or mark as equivalence classes.
4 
5 This tool reads a tab-separated value file line by line, using one or more fields to
6 record a key. If the same key is found in a subsequent line, it is identified as
7 equivalent. When operating in 'uniq' mode, the first time a key is seen the line is
8 written to standard output, but subsequent matching lines are discarded.
9 
10 The alternate to 'uniq' is 'equiv-class' identification. In this mode, all lines
11 written to standard output, but a new field is added marking equivalent entries with
12 with an ID. The ID is simply a one-upped counter.
13 
14 Copyright (c) 2015-2018, eBay Software Foundation
15 Initially written by Jon Degenhardt
16 
17 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
18 */
19 module tsv_uniq;
20 
21 import std.stdio;
22 import std.format : format;
23 import std.typecons : tuple;
24 
25 auto helpText = q"EOS
26 Synopsis: tsv-uniq [options] [file...]
27 
28 tsv-uniq filters out duplicate lines using fields as a key. Filtering is based
29 on the entire line if a key is not provided.
30 
31 Options:
32 EOS";
33 
34 auto helpTextVerbose = q"EOS
35 Synopsis: tsv-uniq [options] [file...]
36 
37 tsv-uniq identifies equivalent lines in tab-separated value files. Input is read
38 line by line, recording a key based on one or more of the fields. Two lines are
39 equivalent if they have the same key. When operating in 'uniq' mode, the first
40 time a key is seen the line is written to standard output, but subsequent lines
41 are discarded. This is similar to the unix 'uniq' program, but based on individual
42 fields and without requiring sorted data. This command uniq's on fields 2 and 3:
43 
44    tsv-uniq -f 2,3 file.tsv
45 
46 The alternate to 'uniq' mode is 'equiv-class' identification. In this mode, all
47 lines are written to standard output, but with a new field added marking
48 equivalent entries with an ID. The ID is simply a one-upped counter. Example:
49 
50    tsv-uniq --header -f 2,3 --equiv file.tsv
51 
52 tsv-uniq can be run without specifying a key field. In this case the whole line
53 is used as a key, same as the Unix 'uniq' program. This works on any line-oriented
54 text file, not just TSV files.
55 
56 The '--r|repeated' option can be used to print only lines occurring more than
57 once. '--a|at-least N' is similar, except that it only prints lines occuring at
58 least N times. For both, the Nth line found is printed, in the order found.
59 
60 The '--m|max MAX' option changes the behavior to output the first MAX lines for
61 each key, rather than just the first line for each key. This can also with used
62 with '--e|equiv' to limit the number output for each equivalence class.
63 
64 It's not obvious when both '--a|at-least' and '--m|max' might be useful, but, if
65 both are specified, the occurrences between 'at-least' and 'max' are output.
66 
67 Options:
68 EOS";
69 
70 /** Container for command line options.
71  */
72 struct TsvUniqOptions
73 {
74     enum defaultEquivHeader = "equiv_id";
75     enum defaultEquivStartID = 1;
76 
77     string programName;
78     bool helpVerbose = false;                 // --h|help-verbose
79     bool versionWanted = false;               // --V|version
80     size_t[] fields;                          // --f|fields
81     bool hasHeader = false;                   // --H|header
82     bool onlyRepeated = false;                // --r|repeated. Shorthand for '--atleast 2'
83     size_t atLeast = 0;                       // --a|at-least. Zero implies default behavior.
84     size_t max = 0;                           // --m|max. Zero implies default behavior.
85     bool equivMode = false;                   // --e|equiv
86     string equivHeader = defaultEquivHeader;  // --equiv-header
87     long equivStartID = defaultEquivStartID;  // --equiv-start
88     bool ignoreCase = false;                  // --i|ignore-case
89     char delim = '\t';                        // --d|delimiter
90     bool keyIsFullLine = false;               // Derived. True if no fields specified or '--f|fields 0'
91 
92     /* Returns a tuple. First value is true if command line arguments were successfully
93      * processed and execution should continue, or false if an error occurred or the user
94      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
95      *
96      * Returning true (execution continues) means args have been validated and derived
97      * values calculated. In addition, field indices have been converted to zero-based.
98      * If the whole line is the key, the individual fields list will be cleared.
99      *
100      * Repeat count control variables 'atLeast' and max' - The general idea is that the
101      * values are left at zero if no repeat count options are specified. They are set if
102      * repeat count options are specified, as follows:
103      *   * atLeast - Will be zero unless --r|repeated or --a|at-least is specified.
104      *     --r|repeated option sets it 2, --a|at-least sets it to the specified value.
105      *   * max - Default to zero. Is set to the --m|max value if provided. Is set to
106      *    'atLeast' if --r|repeated or --a|at-least is provided.
107      *
108      * An exception to the above: If --e|equiv-mode is specified, then (max == 0)
109      * represents the default "output all values" case. In this case max may be less
110      * than the at-least value.
111      */
112     auto processArgs (ref string[] cmdArgs)
113     {
114         import std.algorithm : any, each;
115         import std.getopt;
116         import std.path : baseName, stripExtension;
117         import std.typecons : Yes, No;
118         import tsvutil :  makeFieldListOptionHandler;
119 
120         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
121 
122         try
123         {
124             arraySep = ",";    // Use comma to separate values in command line options
125             auto r = getopt(
126                 cmdArgs,
127                 "help-verbose",  "              Print full help.", &helpVerbose,
128                 std.getopt.config.caseSensitive,
129                 "V|version",     "              Print version information and exit.", &versionWanted,
130                 "H|header",      "              Treat the first line of each file as a header.", &hasHeader,
131                 std.getopt.config.caseInsensitive,
132 
133                 "f|fields",      "<field-list>  Fields to use as the key. Default: 0 (entire line).",
134                 fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex,Yes.allowFieldNumZero),
135 
136                 "i|ignore-case", "              Ignore case when comparing keys.", &ignoreCase,
137                 "r|repeated",    "              Output only lines that are repeated (based on the key).", &onlyRepeated,
138                 "a|at-least",    "INT           Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast,
139                 "m|max",         "INT           Max number of each unique key to output (zero is ignored).", &max,
140                 "e|equiv",       "              Output equiv class IDs rather than uniq'ing entries.", &equivMode,
141                 "equiv-header",  "STR           Use STR as the equiv-id field header. Applies when using '--header --equiv'. Default: 'equiv_id'.", &equivHeader,
142                 "equiv-start",   "INT           Use INT as the first equiv-id. Default: 1.", &equivStartID,
143                 "d|delimiter",   "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
144                 );
145 
146             if (r.helpWanted)
147             {
148                 defaultGetoptPrinter(helpText, r.options);
149                 return tuple(false, 0);
150             }
151             else if (helpVerbose)
152             {
153                 defaultGetoptPrinter(helpTextVerbose, r.options);
154                 return tuple(false, 0);
155             }
156             else if (versionWanted)
157             {
158                 import tsvutils_version;
159                 writeln(tsvutilsVersionNotice("tsv-uniq"));
160                 return tuple(false, 0);
161             }
162 
163             /* Consistency checks */
164             if (!equivMode)
165             {
166                 if (equivHeader != defaultEquivHeader)
167                 {
168                     throw new Exception("--equiv-header requires --e|equiv");
169                 }
170                 else if (equivStartID != defaultEquivStartID)
171                 {
172                     throw new Exception("--equiv-start requires --e|equiv");
173                 }
174             }
175 
176             if (fields.length > 1 && fields.any!(x => x == 0))
177             {
178                 throw new Exception("Whole line as key (--f|field 0) cannot be combined with multiple fields.");
179             }
180 
181             /* Derivations */
182             if (fields.length == 0)
183             {
184                 keyIsFullLine = true;
185             }
186             else if (fields.length == 1 && fields[0] == 0)
187             {
188                 keyIsFullLine = true;
189                 fields.length = 0;
190             }
191 
192             if (onlyRepeated && atLeast <= 1) atLeast = 2;
193             if (atLeast >= 2 && max < atLeast)
194             {
195                 // Don't modify max if it is zero and equivMode is in effect.
196                 if (max != 0 || !equivMode) max = atLeast;
197             }
198 
199             fields.each!((ref x) => --x);  // Convert to 1-based indexing.
200 
201         }
202         catch (Exception exc)
203         {
204             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
205             return tuple(false, 1);
206         }
207         return tuple(true, 0);
208     }
209 }
210 
211 /** Main program. Processes command line arguments and calls tsvUniq which implements
212  * the main processing logic.
213  */
214 int main(string[] cmdArgs)
215 {
216     /* When running in DMD code coverage mode, turn on report merging. */
217     version(D_Coverage) version(DigitalMars)
218     {
219         import core.runtime : dmd_coverSetMerge;
220         dmd_coverSetMerge(true);
221     }
222 
223     TsvUniqOptions cmdopt;
224     auto r = cmdopt.processArgs(cmdArgs);
225     if (!r[0]) return r[1];
226     try tsvUniq(cmdopt, cmdArgs[1..$]);
227     catch (Exception exc)
228     {
229         stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
230         return 1;
231     }
232     return 0;
233 }
234 
235 /** Outputs the unique lines from all the input files.
236  *
237  * Processes the lines in each input file. All lines are added to an associated array.
238  * The first time a line is seen it is output. If key fields are being used these are
239  * used as the basis for the associative array entries rather than the full line.
240  */
241 void tsvUniq(in TsvUniqOptions cmdopt, in string[] inputFiles)
242 {
243     import tsvutil : InputFieldReordering, BufferedOutputRange;
244     import std.algorithm : splitter;
245     import std.array : join;
246     import std.conv : to;
247     import std.range;
248     import std.uni : toLower;
249 
250     /* InputFieldReordering maps the key fields from an input line to a separate buffer. */
251     auto keyFieldsReordering = new InputFieldReordering!char(cmdopt.fields);
252 
253     /* BufferedOutputRange is a performance enhancement for writing to stdout. */
254     auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
255 
256     /* The master hash. The key is the specified fields concatenated together (including
257      * separators). The value is a struct with the equiv-id and occurrence count.
258      */
259     struct EquivEntry { size_t equivID; size_t count; }
260     EquivEntry[string] equivHash;
261 
262     size_t numFields = cmdopt.fields.length;
263     long nextEquivID = cmdopt.equivStartID;
264     bool headerWritten = false;
265     foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
266     {
267         auto inputStream = (filename == "-") ? stdin : filename.File();
268         foreach (lineNum, line; inputStream.byLine.enumerate(1))
269         {
270             if (cmdopt.hasHeader && lineNum == 1)
271             {
272                 /* Header line. */
273                 if (!headerWritten)
274                 {
275                     bufferedOutput.append(line);
276                     if (cmdopt.equivMode)
277                     {
278                         bufferedOutput.append(cmdopt.delim);
279                         bufferedOutput.append(cmdopt.equivHeader);
280                     }
281                     bufferedOutput.appendln();
282                     headerWritten = true;
283                 }
284             }
285             else
286             {
287                 /* Regular line (not header). Start by finding the key. */
288                 typeof(line) key;
289                 if (cmdopt.keyIsFullLine)
290                 {
291                     key = line;
292                 }
293                 else
294                 {
295                     /* Copy the key fields to a new buffer. */
296                     keyFieldsReordering.initNewLine;
297                     foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
298                     {
299                         keyFieldsReordering.processNextField(fieldIndex, fieldValue);
300                         if (keyFieldsReordering.allFieldsFilled) break;
301                     }
302 
303                     if (!keyFieldsReordering.allFieldsFilled)
304                     {
305                         throw new Exception(
306                             format("Not enough fields in line. File: %s, Line: %s",
307                                    (filename == "-") ? "Standard Input" : filename, lineNum));
308                     }
309 
310                     key = keyFieldsReordering.outputFields.join(cmdopt.delim);
311                 }
312 
313                 if (cmdopt.ignoreCase) key = key.toLower;
314 
315                 bool isOutput = false;
316                 EquivEntry currEntry;
317                 EquivEntry* priorEntry = (key in equivHash);
318                 if (priorEntry is null)
319                 {
320                     isOutput = (cmdopt.atLeast <= 1);
321                     currEntry.equivID = nextEquivID;
322                     currEntry.count = 1;
323                     equivHash[key.to!string] = currEntry;
324                     nextEquivID++;
325                 }
326                 else
327                 {
328                     (*priorEntry).count++;
329                     currEntry = *priorEntry;
330 
331                     if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) ||
332                         (cmdopt.equivMode && cmdopt.max == 0))
333                     {
334                         isOutput = true;
335                     }
336                 }
337 
338                 if (isOutput)
339                 {
340                     bufferedOutput.append(line);
341                     if (cmdopt.equivMode)
342                     {
343                         bufferedOutput.append(cmdopt.delim);
344                         bufferedOutput.append(currEntry.equivID.to!string);
345                     }
346                     bufferedOutput.appendln();
347                 }
348             }
349         }
350     }
351 }