Using CHARINDEX and SUBSTRING:
He considers 'amazon.com', 'http://amazon.com' and 'https://amazon.com' as duplicates.
-- Report URLs that occur more than once after a leading http:// or https://
-- scheme has been removed, together with how many rows share each URL.
-- With the sample rows below the result is Amazon.com (3) and
-- Stackoverflow.com (2); google.com appears once and is filtered out.
WITH MyTable(OriginalURL) AS (
    -- Sample data: the same sites written with and without a scheme.
    SELECT 'http://Amazon.com' UNION ALL
    SELECT 'https://Amazon.com' UNION ALL
    SELECT 'Amazon.com' UNION ALL
    SELECT 'http://Stackoverflow.com' UNION ALL
    SELECT 'Stackoverflow.com' UNION ALL
    SELECT 'http://google.com'
),
-- Strip the scheme in one place so SELECT and GROUP BY cannot drift apart.
Normalized(OriginalURL) AS (
    SELECT
        CASE
            -- "= 1" anchors the match to the start of the string. A "> 0" test
            -- would also fire on a scheme embedded later in the value (for
            -- example inside a query-string parameter) and chop off the first
            -- characters of an unrelated URL.
            -- 8 and 9 are the 1-based positions of the first character after
            -- 'http://' (7 chars) and 'https://' (8 chars) respectively.
            WHEN CHARINDEX('http://', OriginalURL) = 1
                THEN SUBSTRING(OriginalURL, 8, LEN(OriginalURL) - 7)
            WHEN CHARINDEX('https://', OriginalURL) = 1
                THEN SUBSTRING(OriginalURL, 9, LEN(OriginalURL) - 8)
            ELSE OriginalURL
        END
    FROM MyTable
)
SELECT
    OriginalURL
    , COUNT(*) AS DupeCount
FROM Normalized
GROUP BY OriginalURL
-- Keep only URLs that appear more than once after normalization.
HAVING COUNT(*) > 1;
source
share