forked from External/ergo
use the TR39 skeleton algorithm to prevent confusables (#178)
This commit is contained in:
parent
a11486d699
commit
b9b2553a2f
9 changed files with 271 additions and 76 deletions
|
|
@ -8,21 +8,25 @@ package irc
|
|||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/mtibben/confusables"
|
||||
"golang.org/x/text/secure/precis"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
const (
|
||||
// casemappingName identifies the casemapping scheme by the RFC it follows
// (RFC 8265, the PRECIS profiles for usernames and passwords).
// NOTE(review): presumably this is the value advertised to clients as the
// server's casemapping; its use is not visible here, confirm against callers.
casemappingName = "rfc8265"
|
||||
)
|
||||
|
||||
// Casefold returns a casefolded string, without doing any name or channel character checks.
|
||||
func Casefold(str string) (string, error) {
|
||||
var err error
|
||||
oldStr := str
|
||||
// Each pass of PRECIS casefolding is a composition of idempotent operations,
|
||||
// but not idempotent itself. Therefore, the spec says "do it four times and hope
|
||||
// it converges" (lolwtf). Golang's PRECIS implementation has a "repeat" option,
|
||||
// which provides this functionality, but unfortunately it's not exposed publicly.
|
||||
func iterateFolding(profile *precis.Profile, oldStr string) (str string, err error) {
|
||||
str = oldStr
|
||||
// follow the stabilizing rules laid out here:
|
||||
// https://tools.ietf.org/html/draft-ietf-precis-7564bis-10.html#section-7
|
||||
for i := 0; i < 4; i++ {
|
||||
str, err = precis.UsernameCaseMapped.CompareKey(str)
|
||||
str, err = profile.CompareKey(str)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
|
@ -37,6 +41,11 @@ func Casefold(str string) (string, error) {
|
|||
return str, nil
|
||||
}
|
||||
|
||||
// Casefold returns a casefolded string, without doing any name or channel character checks.
|
||||
func Casefold(str string) (string, error) {
|
||||
return iterateFolding(precis.UsernameCaseMapped, str)
|
||||
}
|
||||
|
||||
// CasefoldChannel returns a casefolded version of a channel name.
|
||||
func CasefoldChannel(name string) (string, error) {
|
||||
if len(name) == 0 {
|
||||
|
|
@ -96,3 +105,46 @@ func CasefoldName(name string) (string, error) {
|
|||
|
||||
return lowered, err
|
||||
}
|
||||
|
||||
// isBoring reports whether name consists solely of ASCII letters, ASCII
// digits, and a small set of harmless ASCII punctuation. The empty string
// is boring.
//
// Boring names are exempt from skeletonization: confusables.txt treats
// several pure-ASCII alphanumeric sequences as confusable (0 and O, 1 and l,
// m and rn), and folding those together causes more problems than it solves.
func isBoring(name string) bool {
	// Inspect bytes, not runes: every byte of a multi-byte UTF-8 sequence is
	// >= 0x80, so any non-ASCII input lands in the default branch.
	for _, c := range []byte(name) {
		switch {
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
			// alphanumerics
		case c == '$', c == '%', c == '^', c == '&',
			c == '(', c == ')', c == '{', c == '}',
			c == '[', c == ']', c == '<', c == '>', c == '=':
			// benign printable ascii characters
		default:
			// potentially confusable ascii like | ' `, or non-ascii
			return false
		}
	}
	return true
}
|
||||
|
||||
// skeletonCasefolder is the PRECIS identifier profile used to casefold
// skeletons. It applies width folding, lowercasing and NFC normalization.
// NOTE(review): no bidi-rule option is passed here, which appears to be the
// intended difference from the profile used by Casefold; confirm against the
// precis package documentation.
var skeletonCasefolder = precis.NewIdentifier(precis.FoldWidth, precis.LowerCase(), precis.Norm(norm.NFC))
|
||||
|
||||
// similar to Casefold, but exempts the bidi rule, because skeletons may
|
||||
// mix scripts strangely
|
||||
func casefoldSkeleton(str string) (string, error) {
|
||||
return iterateFolding(skeletonCasefolder, str)
|
||||
}
|
||||
|
||||
// Skeleton produces a canonicalized identifier that tries to catch
|
||||
// homoglyphic / confusable identifiers. It's a tweaked version of the TR39
|
||||
// skeleton algorithm. We apply the skeleton algorithm first and only then casefold,
|
||||
// because casefolding first would lose some information about visual confusability.
|
||||
// This has the weird consequence that the skeleton is not a function of the
|
||||
// casefolded identifier --- therefore it must always be computed
|
||||
// from the original (unfolded) identifier and stored/tracked separately from the
|
||||
// casefolded identifier.
|
||||
func Skeleton(name string) (string, error) {
|
||||
if !isBoring(name) {
|
||||
name = confusables.Skeleton(name)
|
||||
}
|
||||
return casefoldSkeleton(name)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue