/**
 * @name Duplication in regular expression character class
 * @description Duplicate characters in a class have no effect and may indicate an error in the regular expression.
 * @kind problem
 * @tags quality
 *       maintainability
 *       readability
 * @problem.severity warning
 * @sub-severity low
 * @precision very-high
 * @id py/regex/duplicate-in-character-class
 */

import python
import semmle.python.regex

/**
 * Holds if the text `char` is produced by two different `character` spans of
 * `r`, and both spans lie strictly inside the same `charSet` span of `r`.
 *
 * Two cases are never reported: the placeholder "�", and whitespace
 * characters when `r` has the "VERBOSE" mode.
 */
predicate duplicate_char_in_class(RegExp r, string char) {
  // "�" stands in for any character that could not be encoded, so two of them
  // are not necessarily the same character.
  char != "�" and
  // Whitespace is ignored in verbose mode, so repeating it is not reported.
  not (
    char in [" ", "\t", "\r", "\n"] and
    r.getAMode() = "VERBOSE"
  ) and
  exists(
    int setStart, int setEnd, int firstStart, int firstEnd, int secondStart, int secondEnd
  |
    r.charSet(setStart, setEnd) and
    // First occurrence: strictly inside the set, with text `char`.
    r.character(firstStart, firstEnd) and
    setStart < firstStart and
    firstEnd < setEnd and
    r.getText().substring(firstStart, firstEnd) = char and
    // Second occurrence: same constraints as the first.
    r.character(secondStart, secondEnd) and
    setStart < secondStart and
    secondEnd < setEnd and
    r.getText().substring(secondStart, secondEnd) = char and
    // The two occurrences must differ in both their start and end offsets.
    firstStart != secondStart and
    firstEnd != secondEnd
  )
}

// Report each regular expression once per duplicated character text.
from RegExp r, string char, string message
where
  duplicate_char_in_class(r, char) and
  message =
    "This regular expression includes duplicate character '" + char + "' in a set of characters."
select r, message