1 /**
2 Command line tool that identifies equivalent lines in an input stream. Equivalent
3 lines are identified using either the full line or a set of fields as the key. By
4 default, input is written to standard output, retaining only the first occurrence of
5 equivalent lines. There are also options for marking and numbering equivalent lines
6 rather, without filtering out duplicates.
7 
8 This tool is similar in spirit to the Unix 'uniq' tool, with some key differences.
9 First, the key can be composed of individual fields, not just the full line. Second,
10 input does not need to be sorted. (Unix 'uniq' only detects equivalent lines when
11 they are adjacent, hence the usual need for sorting.)
12 
13 There are a couple alternative to uniq'ing the input lines. One is to mark lines with
14 an equivalence ID, which is a one-upped counter. The other is to number lines, with
15 each unique key have its own set of numbers.
16 
17 Copyright (c) 2015-2020, eBay Inc.
18 Initially written by Jon Degenhardt
19 
20 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
21 */
22 module tsv_utils.tsv_uniq;
23 
24 import std.exception : enforce;
25 import std.format : format;
26 import std.range;
27 import std.stdio;
28 import std.typecons : tuple;
29 
30 auto helpText = q"EOS
31 Synopsis: tsv-uniq [options] [file...]
32 
33 tsv-uniq filters out duplicate lines using fields as a key. Filtering is
34 based on the entire line when key fields are not provided. Options are
35 also available for assigning a unique id to each key and numbering the
36 occurrences of each key. Use '--help-verbose' for more details.
37 
38 Options:
39 EOS";
40 
41 auto helpTextVerbose = q"EOS
42 Synopsis: tsv-uniq [options] [file...]
43 
44 tsv-uniq identifies equivalent lines in tab-separated value files. Input
45 is read line by line, recording a key for each line based on one or more
46 of the fields. Two lines are equivalent if they have the same key. The
47 first time a key is seen its line is written to standard output.
48 Subsequent lines containing the same key are discarded. This command
49 uniq's a file on fields 2 and 3:
50 
51    tsv-uniq -f 2,3 file.tsv
52 
53 This is similar to the Unix 'uniq' program, but based on individual
54 fields and without requiring sorted data.
55 
56 tsv-uniq can be run without specifying a key field. In this case the
57 whole line is used as a key, same as the Unix 'uniq' program. This works
58 on any line-oriented text file, not just TSV files.
59 
60 The above is the default behavior ('uniq' mode). The alternates to 'uniq'
61 mode are 'number' mode and 'equiv-class' mode. In 'equiv-class' mode, all
62 lines are written to standard output, but with a field appended marking
63 equivalent entries with an ID. The ID is a one-upped counter. Example:
64 
65    tsv-uniq --header -f 2,3 --equiv file.tsv
66 
67 'Number' mode also writes all lines to standard output, but with a field
68 appended numbering the occurrence count for the line's key. The first line
69 with a specific key is assigned the number '1', the second with the key is
70 assigned number '2', etc. 'Number' and 'equiv-class' modes can be combined.
71 
72 The '--r|repeated' option can be used to print only lines occurring more
73 than once. Specifically, the second occurrence of a key is printed. The
74 '--a|at-least N' option is similar, printing lines occurring at least N
75 times. (Like repeated, the Nth line with the key is printed.)
76 
77 The '--m|max MAX' option changes the behavior to output the first MAX
78 lines for each key, rather than just the first line for each key.
79 
80 If both '--a|at-least' and '--m|max' are specified, the occurrences
81 starting with 'at-least' and ending with 'max' are output.
82 
83 Options:
84 EOS";
85 
86 /** Container for command line options.
87  */
88 struct TsvUniqOptions
89 {
90     import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader;
91 
92     enum defaultEquivHeader = "equiv_id";
93     enum defaultEquivStartID = 1;
94     enum defaultNumberHeader = "equiv_line";
95 
96     string programName;
97     InputSourceRange inputSources;            // Input files
98     bool helpVerbose = false;                 // --h|help-verbose
99     bool versionWanted = false;               // --V|version
100     size_t[] fields;                          // --f|fields
101     bool hasHeader = false;                   // --H|header
102     bool onlyRepeated = false;                // --r|repeated. Shorthand for '--atleast 2'
103     size_t atLeast = 0;                       // --a|at-least. Zero implies default behavior.
104     size_t max = 0;                           // --m|max. Zero implies default behavior.
105     bool numberMode = false;                  // --z|number
106     string numberHeader = defaultNumberHeader;  // --number-header
107     bool equivMode = false;                   // --e|equiv
108     string equivHeader = defaultEquivHeader;  // --equiv-header
109     long equivStartID = defaultEquivStartID;  // --equiv-start
110     bool ignoreCase = false;                  // --i|ignore-case
111     char delim = '\t';                        // --d|delimiter
112     bool keyIsFullLine = false;               // Derived. True if no fields specified or '--f|fields 0'
113 
114     /* Returns a tuple. First value is true if command line arguments were successfully
115      * processed and execution should continue, or false if an error occurred or the user
116      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
117      *
118      * Returning true (execution continues) means args have been validated and derived
119      * values calculated. In addition, field indices have been converted to zero-based.
120      * If the whole line is the key, the individual fields list will be cleared.
121      *
122      * Repeat count control variables 'atLeast' and max' - These values are left at zero
123      * if no repeat count options are specified. They are set if repeat count options
124      * are specified, as follows:
125      *   * atLeast - Will be zero unless --r|repeated or --a|at-least is specified.
126      *     --r|repeated option sets it 2, --a|at-least sets it to the specified value.
127      *   * max - Default to zero. Is set to the --m|max value if provided. Is set to
128      *    'atLeast' if --r|repeated or --a|at-least is provided.
129      *
130      * An exception to the above: If --e|equiv-mode is specified, then (max == 0)
131      * represents the default "output all values" case. In this case max may be less
132      * than the at-least value.
133      */
134     auto processArgs (ref string[] cmdArgs)
135     {
136         import std.algorithm : all, each;
137         import std.getopt;
138         import std.path : baseName, stripExtension;
139         import std.typecons : Yes, No;
140         import tsv_utils.common.utils :  makeFieldListOptionHandler;
141 
142         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
143 
144         try
145         {
146             arraySep = ",";    // Use comma to separate values in command line options
147             auto r = getopt(
148                 cmdArgs,
149                 "help-verbose",  "              Print full help.", &helpVerbose,
150                 std.getopt.config.caseSensitive,
151                 "V|version",     "              Print version information and exit.", &versionWanted,
152                 "H|header",      "              Treat the first line of each file as a header.", &hasHeader,
153                 std.getopt.config.caseInsensitive,
154 
155                 "f|fields",      "<field-list>  Fields to use as the key. Default: 0 (entire line).",
156                 fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex,Yes.allowFieldNumZero),
157 
158                 "i|ignore-case", "              Ignore case when comparing keys.", &ignoreCase,
159                 "r|repeated",    "              Output only lines that are repeated (based on the key).", &onlyRepeated,
160                 "a|at-least",    "INT           Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast,
161                 "m|max",         "INT           Max number of each unique key to output (zero is ignored).", &max,
162                 "e|equiv",       "              Output equivalence class IDs rather than uniq'ing entries.", &equivMode,
163                 "equiv-header",  "STR           Use STR as the equiv-id field header (when using '-H --equiv'). Default: 'equiv_id'.", &equivHeader,
164                 "equiv-start",   "INT           Use INT as the first equiv-id. Default: 1.", &equivStartID,
165                 "z|number",      "              Output equivalence class occurrence counts rather than uniq'ing entries.", &numberMode,
166                 "number-header", "STR           Use STR as the '--number' field header (when using '-H --number)'. Default: 'equiv_line'.", &numberHeader,
167                 "d|delimiter",   "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
168                 );
169 
170             if (r.helpWanted)
171             {
172                 defaultGetoptPrinter(helpText, r.options);
173                 return tuple(false, 0);
174             }
175             else if (helpVerbose)
176             {
177                 defaultGetoptPrinter(helpTextVerbose, r.options);
178                 return tuple(false, 0);
179             }
180             else if (versionWanted)
181             {
182                 import tsv_utils.common.tsvutils_version;
183                 writeln(tsvutilsVersionNotice("tsv-uniq"));
184                 return tuple(false, 0);
185             }
186 
187             /* Input files. Remaining command line args are files. */
188             string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
189             cmdArgs.length = 1;
190             ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
191             inputSources = inputSourceRange(filepaths, readHeader);
192 
193             /* Consistency checks */
194             if (!equivMode)
195             {
196                 enforce(equivHeader == defaultEquivHeader, "--equiv-header requires --e|equiv");
197                 enforce(equivStartID == defaultEquivStartID, "--equiv-start requires --e|equiv");
198             }
199 
200             enforce(numberMode || numberHeader == defaultNumberHeader,
201                     "--number-header requires --z|number");
202 
203             enforce(fields.length <= 1 || fields.all!(x => x != 0),
204                     "Whole line as key (--f|field 0) cannot be combined with multiple fields.");
205 
206             /* Derivations */
207             if (fields.length == 0)
208             {
209                 keyIsFullLine = true;
210             }
211             else if (fields.length == 1 && fields[0] == 0)
212             {
213                 keyIsFullLine = true;
214                 fields.length = 0;
215             }
216 
217             if (onlyRepeated && atLeast <= 1) atLeast = 2;
218             if (atLeast >= 2 && max < atLeast)
219             {
220                 // Don't modify max if it is zero and equivMode or numberMode is in effect.
221                 if (max != 0 || (!equivMode && !numberMode)) max = atLeast;
222             }
223 
224             if (!keyIsFullLine) fields.each!((ref x) => --x);  // Convert to 0-based indexing.
225 
226         }
227         catch (Exception exc)
228         {
229             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
230             return tuple(false, 1);
231         }
232         return tuple(true, 0);
233     }
234 }
235 
236 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
237 
238 /** Main program. Processes command line arguments and calls tsvUniq which implements
239  * the main processing logic.
240  */
241 int main(string[] cmdArgs)
242 {
243     /* When running in DMD code coverage mode, turn on report merging. */
244     version(D_Coverage) version(DigitalMars)
245     {
246         import core.runtime : dmd_coverSetMerge;
247         dmd_coverSetMerge(true);
248     }
249 
250     TsvUniqOptions cmdopt;
251     auto r = cmdopt.processArgs(cmdArgs);
252     if (!r[0]) return r[1];
253 
254     version(LDC_Profile)
255     {
256         import ldc.profile : resetAll;
257         resetAll();
258     }
259 
260     try tsvUniq(cmdopt);
261     catch (Exception exc)
262     {
263         stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
264         return 1;
265     }
266     return 0;
267 }
268 
269 /** Outputs the unique lines from all the input files.
270  *
271  * Processes the lines in each input file. All lines are added to an associated array.
272  * The first time a line is seen it is output. If key fields are being used these are
273  * used as the basis for the associative array entries rather than the full line.
274  */
275 void tsvUniq(ref TsvUniqOptions cmdopt)
276 {
277     import tsv_utils.common.utils : bufferedByLine, BufferedOutputRange,
278         InputFieldReordering, InputSourceRange, joinAppend, throwIfWindowsNewlineOnUnix;
279     import std.algorithm : splitter;
280     import std.array : appender;
281     import std.conv : to;
282     import std.uni : asLowerCase;
283     import std.utf : byChar;
284 
285     /* inputSources must be an InputSourceRange and include at least stdin. */
286     assert(!cmdopt.inputSources.empty);
287     static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
288 
289     /* InputFieldReordering maps the key fields from an input line to a separate buffer. */
290     auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.fields);
291 
292     /* BufferedOutputRange is a performance enhancement for writing to stdout. */
293     auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
294 
295     /* The master hash. The key is the specified fields concatenated together (including
296      * separators). The value is a struct with the equiv-id and occurrence count.
297      */
298     static struct EquivEntry { size_t equivID; size_t count; }
299     EquivEntry[string] equivHash;
300 
301     /* Reusable buffers for multi-field keys and case-insensitive keys. */
302     auto multiFieldKeyBuffer = appender!(char[]);
303     auto lowerKeyBuffer = appender!(char[]);
304 
305     const size_t numKeyFields = cmdopt.fields.length;
306     long nextEquivID = cmdopt.equivStartID;
307 
308     /* First header is read during command line arg processing. */
309     if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
310     {
311         auto inputStream = cmdopt.inputSources.front;
312         throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);
313 
314         bufferedOutput.append(inputStream.header);
315 
316         if (cmdopt.equivMode)
317         {
318             bufferedOutput.append(cmdopt.delim);
319             bufferedOutput.append(cmdopt.equivHeader);
320         }
321 
322         if (cmdopt.numberMode)
323         {
324             bufferedOutput.append(cmdopt.delim);
325             bufferedOutput.append(cmdopt.numberHeader);
326         }
327 
328         bufferedOutput.appendln();
329     }
330 
331     immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1;
332 
333     foreach (inputStream; cmdopt.inputSources)
334     {
335         if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1);
336 
337         foreach (lineNum, line; inputStream.file.bufferedByLine.enumerate(fileBodyStartLine))
338         {
339             if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum);
340 
341             /* Start by finding the key. */
342             typeof(line) key;
343             if (cmdopt.keyIsFullLine)
344             {
345                 key = line;
346             }
347             else
348             {
349                 assert(keyFieldsReordering !is null);
350 
351                 /* Copy the key fields to a new buffer. */
352                 keyFieldsReordering.initNewLine;
353                 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
354                 {
355                     keyFieldsReordering.processNextField(fieldIndex, fieldValue);
356                     if (keyFieldsReordering.allFieldsFilled) break;
357                 }
358 
359                 enforce(keyFieldsReordering.allFieldsFilled,
360                         format("Not enough fields in line. File: %s, Line: %s",
361                                inputStream.name, lineNum));
362 
363                 if (numKeyFields == 1)
364                 {
365                     key = keyFieldsReordering.outputFields[0];
366                 }
367                 else
368                 {
369                     multiFieldKeyBuffer.clear();
370                     keyFieldsReordering.outputFields.joinAppend(multiFieldKeyBuffer, cmdopt.delim);
371                     key = multiFieldKeyBuffer.data;
372                 }
373             }
374 
375             if (cmdopt.ignoreCase)
376             {
377                 /* Equivalent to key = key.toLower, but without memory allocation. */
378                 lowerKeyBuffer.clear();
379                 lowerKeyBuffer.put(key.asLowerCase.byChar);
380                 key = lowerKeyBuffer.data;
381             }
382 
383             bool isOutput = false;
384             EquivEntry currEntry;
385             EquivEntry* priorEntry = (key in equivHash);
386             if (priorEntry is null)
387             {
388                 isOutput = (cmdopt.atLeast <= 1);
389                 currEntry.equivID = nextEquivID;
390                 currEntry.count = 1;
391                 equivHash[key.to!string] = currEntry;
392                 nextEquivID++;
393             }
394             else
395             {
396                 (*priorEntry).count++;
397                 currEntry = *priorEntry;
398 
399                 if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) ||
400                     (cmdopt.equivMode && cmdopt.max == 0) ||
401                     (cmdopt.numberMode && cmdopt.max == 0))
402                 {
403                     isOutput = true;
404                 }
405             }
406 
407             if (isOutput)
408             {
409                 bufferedOutput.append(line);
410 
411                 if (cmdopt.equivMode)
412                 {
413                     bufferedOutput.append(cmdopt.delim);
414                     bufferedOutput.append(currEntry.equivID.to!string);
415                 }
416 
417                 if (cmdopt.numberMode)
418                 {
419                     bufferedOutput.append(cmdopt.delim);
420                     bufferedOutput.append(currEntry.count.to!string);
421                 }
422 
423                 bufferedOutput.appendln();
424             }
425         }
426     }
427 }