Grouping similar elements recursively

I read the following Microsoft article on recursive queries using CTEs, and I just can't seem to wrap my head around how to use them to group common elements.

I have a table containing the following columns:

  • ID
  • Firstname
  • Lastname
  • DateOfBirth
  • Birthcountry
  • Groupid

What I need to do is start with the first person in the table, then iterate through the table and find all the people who have the same (LastName and BirthCountry) or the same (DateOfBirth and BirthCountry).

Now the tricky part is that I have to assign them all the same GroupID, and then, for each person in that GroupID, I need to check whether anyone else shares the same information and put them in the same GroupID as well.

I think I could do this with a few cursors, but it gets complicated.

Here is sample data and output.

ID          FirstName  LastName   DateOfBirth BirthCountry GroupID
----------- ---------- ---------- ----------- ------------ -----------
1           Jonh       Doe        1983-01-01  Grand        100
2           Jack       Stone      1976-06-08  Grand        100
3           Jane       Doe        1982-02-08  Grand        100
4           Adam       Wayne      1983-01-01  Grand        100
5           Kay        Wayne      1976-06-08  Grand        100
6           Matt       Knox       1983-01-01  Hay          101
  • John Doe and Jane Doe are in the same group (100) because they have the same (LastName and BirthCountry).

  • Adam Wayne is in group (100) because he has the same (BirthDate and BirthCountry) as John Doe.

  • Kay Wayne is in group (100) because she has the same (LastName and BirthCountry) as Adam Wayne, who is already in group (100).

  • Matt Knox is in the new group (101) because he does not match anyone in the previous groups.

  • Jack Stone is in group (100) because he has the same (BirthDate and BirthCountry) as Kay Wayne, who is already in group (100).

Test data:

-- Sample data for the grouping problem: one row per person.
-- GroupID starts as NULL for every row.
CREATE TABLE #Tbl(
    ID              INT,
    FirstName       VARCHAR(50),
    LastName        VARCHAR(50),
    DateOfBirth     DATE,
    BirthCountry    VARCHAR(50),
    GroupID         INT NULL
);

-- The explicit column list keeps this INSERT correct even if columns are
-- later added to, or reordered in, the table definition.
INSERT INTO #Tbl (ID, FirstName, LastName, DateOfBirth, BirthCountry, GroupID) VALUES
(1, 'Jonh', 'Doe',      '1983-01-01',   'Grand',    NULL),
(2, 'Jack', 'Stone',    '1976-06-08',   'Grand',    NULL),
(3, 'Jane', 'Doe',      '1982-02-08',   'Grand',    NULL),
(4, 'Adam', 'Wayne',    '1983-01-01',   'Grand',    NULL),
(5, 'Kay',  'Wayne',    '1976-06-08',   'Grand',    NULL),
(6, 'Matt', 'Knox',     '1983-01-01',   'Hay',      NULL);
+4
3

. , . , Kay Adam .

-- Computes a GroupID for every person in #Tbl.
-- Two people are linked when they share BirthCountry and either LastName or
-- DateOfBirth; links are followed transitively, so the GroupID identifies the
-- whole chain of linked people.
with data as (
    -- One row per distinct (LastName, DateOfBirth, BirthCountry) combination,
    -- each given an integer key grpNum.
    select
        LastName, DateOfBirth, BirthCountry,
        row_number() over (order by LastName, DateOfBirth, BirthCountry) as grpNum
    from #Tbl
    group by LastName, DateOfBirth, BirthCountry
), r as (
    -- Anchor: every combination starts a walk rooted at itself.
    -- equ is a '|'-delimited list of the grpNum values already visited.
    select
        d.LastName, d.DateOfBirth, d.BirthCountry, d.grpNum,
        cast('|' + cast(d.grpNum as varchar(8)) + '|' as varchar(1024)) as equ
    from data as d
    union all
    -- Step: extend the walk to a linked combination. The row keeps the ROOT's
    -- grpNum (r.grpNum) but takes the attributes of the newly reached
    -- combination so the next step can continue from there.
    select
        d.LastName, d.DateOfBirth, d.BirthCountry, r.grpNum,
        cast(r.equ + cast(d.grpNum as varchar(8)) + '|' as varchar(1024))
    from r
        inner join data as d
            on      d.grpNum > r.grpNum   -- only reach combinations numbered above the root
                -- skip combinations already on this path (prevents cycles)
                and charindex('|' + cast(d.grpNum as varchar(8)) + '|', r.equ) = 0
                and (d.LastName = r.LastName or d.DateOfBirth = r.DateOfBirth)
                and d.BirthCountry = r.BirthCountry
), g as (
    -- For each combination keep the smallest root that reached it.
    select LastName, DateOfBirth, BirthCountry, min(grpNum) as grpNum
    from r
    group by LastName, DateOfBirth, BirthCountry
)
-- Columns are listed explicitly because #Tbl already has a GroupID column;
-- t.* would return two columns with that name.
-- dense_rank() starts at 1, so + 99 numbers the first group 100.
-- Rows with a NULL LastName, DateOfBirth or BirthCountry do not satisfy the
-- equality join and are therefore not returned.
select
    t.ID, t.FirstName, t.LastName, t.DateOfBirth, t.BirthCountry,
    dense_rank() over (order by g.grpNum) + 99 as GroupID
from #Tbl as t
    inner join g
        on      g.LastName = t.LastName
            and g.DateOfBirth = t.DateOfBirth
            and g.BirthCountry = t.BirthCountry
order by t.ID

( ), ( , ..). , , GROUP BY.

http://rextester.com/edit/TVRVZ10193

EDIT: , , , . row_number() say min(ID) as grpNum, , , ID .

+1

, groupid - , 100. , .

-, " ". RBAR. , , , RBAR.

, , SET BASE METHOD, .

, RBAR script , , . , .

Alsi script , id , , , .

print , .

    -- Row-by-row (WHILE loop) assignment of GroupID over sample rows held in
    -- the table variable @Tbl. Each iteration takes the row whose id equals the
    -- loop counter and stamps a group id on every row that matches it.
    SET NOCOUNT ON
DECLARE @Tbl TABLE(
    ID              INT,
    FirstName       VARCHAR(50),
    LastName        VARCHAR(50),
    DateOfBirth     DATE,
    BirthCountry    VARCHAR(50),
    GroupID         INT NULL
);

-- Sample rows; GroupID starts as NULL everywhere.
INSERT INTO @Tbl VALUES
(1, 'Jonh', 'Doe',      '1983-01-01',   'Grand',    NULL) ,
(2, 'Jack', 'Stone',    '1976-06-08',   'Grand',    NULL),
(3, 'Jane', 'Doe',      '1982-02-08',   'Grand',    NULL),
(4, 'Adam', 'Wayne',    '1983-01-01',   'Grand',    NULL),
(5, 'Kay',  'Wayne',    '1976-06-08',   'Grand',    NULL),
(6, 'Matt', 'Knox',     '1983-01-01',   'Hay',      NULL),
(7, 'Jerry', 'Stone',   '1976-06-08',   'Hay',      NULL)


-- @StartGroupid : first group id handed out.
-- @id/@Groupid  : id and current GroupID of the row being processed.
-- @Maxid        : highest id in @Tbl; upper bound of the loop.
-- @i            : loop counter, compared directly against id values.
-- @MinGroupID / @MaxGroupID : smallest / largest GroupID read so far,
--                 both seeded with @StartGroupid.
-- @LastGroupID  : last non-null GroupID read; assigned below but not read
--                 anywhere in this script.
DECLARE @StartGroupid INT = 100
DECLARE @id INT
DECLARE @Groupid INT
DECLARE @Maxid INT
DECLARE @i INT = 1
DECLARE @MinGroupID int=@StartGroupid
DECLARE @MaxGroupID int=@StartGroupid
DECLARE @LastGroupID int
SELECT @maxid = max(id)
FROM @tbl

-- Visits id values 1..@maxid in ascending order.
-- NOTE(review): the loop looks rows up by id = @i, so it presumably expects
-- ids to start at 1 without gaps -- confirm against real data.
WHILE (@i <= @maxid)
BEGIN
    -- Load the current row's id and the GroupID it carries at this point
    -- (possibly assigned by an earlier iteration).
    SELECT @id = id
        ,@Groupid = Groupid
    FROM @Tbl a
    WHERE id = @i

    -- Track the running min / max / last GroupID, using only the value just
    -- read. NOTE(review): a group id written by the UPDATE below is not
    -- reflected in @MaxGroupID until a later iteration reads a row carrying
    -- it -- confirm that two unrelated new groups cannot get the same id.
    if(@Groupid is not null and @Groupid<@MinGroupID)
    set @MinGroupID=@Groupid
    if(@Groupid is not null and @Groupid>@MaxGroupID)
    set @MaxGroupID=@Groupid
    if(@Groupid is not null)
    set @LastGroupID=@Groupid

    -- Stamp a group id on every row A that matches the current row B
    -- (B matches itself, so the current row is updated too).
    -- The CASE has no ELSE: when @id = 1 and b.groupid is already non-null
    -- it yields NULL.
    -- NOTE(review): the "already grouped" branch writes @MinGroupID (the
    -- smallest id seen so far), not b.groupid itself -- confirm intended.
    UPDATE A
    SET groupid =case 
            when @id=1 and  b.groupid is null then @StartGroupid 
            when @id>1 and  b.groupid is null then @MaxGroupID+1--(Select max(groupid)+1 from @tbl where id<@id)
            when @id>1 and  b.groupid is not null then @MinGroupID --(Select min(groupid) from @tbl where id<@id)
    end
    FROM @Tbl A
    INNER JOIN @tbl B ON b.id = @ID
    -- Match rules: any two of (BirthCountry, DateOfBirth, LastName) equal.
    -- The third rule (LastName + DateOfBirth) does not compare BirthCountry.
    WHERE (
            (
                a.BirthCountry = b.BirthCountry
                and a.DateOfBirth = b.dateofbirth
                )
            or (a.LastName = b.LastName and a.BirthCountry = b.BirthCountry)
                 or (a.LastName = b.LastName and a.dateofbirth = b.dateofbirth)
            )

--if(@id=7) --@id=2,@id=3 and so on (for debug
--break

    -- Advance to the next id.
    SET @i = @i + 1
    SET @ID = @I
END

-- Show the rows with their assigned GroupID.
SELECT * 
FROM @Tbl

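I also tried a recursive CTE, but it generates about 56,000 rows before they are filtered down with rownum = 1. So its performance is poor, and I do not recommend it.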

-- Recursive-CTE variant of the group assignment. It relies on @Tbl,
-- @StartGroupid and @Maxid being declared earlier in the same batch.
-- The recursive member pairs every row with id > 1 with every row produced
-- so far, so the intermediate result grows very quickly; the final step
-- keeps, per id, the row with the lowest GroupID.
;WITH walk AS
(
    -- Anchor: the row with id 1 opens the first group.
    SELECT
        seed.ID,
        seed.FirstName,
        seed.LastName,
        seed.DateOfBirth,
        seed.BirthCountry,
        @StartGroupid AS GroupID,
        1 AS rn
    FROM @Tbl AS seed
    WHERE seed.ID = 1

    UNION ALL

    -- Step: a row inherits the previous row's group when any two of
    -- (BirthCountry, DateOfBirth, LastName) agree, otherwise it gets the
    -- previous group + 1. rn counts recursion depth and is capped by @Maxid.
    SELECT
        cur.ID,
        cur.FirstName,
        cur.LastName,
        cur.DateOfBirth,
        cur.BirthCountry,
        CASE
            WHEN (
                    (cur.BirthCountry = prev.BirthCountry AND cur.DateOfBirth = prev.DateOfBirth)
                 OR (cur.LastName = prev.LastName AND cur.BirthCountry = prev.BirthCountry)
                 OR (cur.LastName = prev.LastName AND cur.DateOfBirth = prev.DateOfBirth)
                 )
                THEN prev.GroupID
            ELSE prev.GroupID + 1
        END,
        prev.rn + 1
    FROM @Tbl AS cur
    INNER JOIN walk AS prev
        ON cur.ID > 1
    WHERE prev.rn < @Maxid
),
ranked AS
(
    -- Number the candidate rows of each id by ascending GroupID.
    SELECT
        walk.*,
        ROW_NUMBER() OVER (PARTITION BY walk.ID ORDER BY walk.GroupID) AS rownum
    FROM walk
)
SELECT *
FROM ranked
WHERE rownum = 1
+1

Perhaps you can run it this way

-- Lists each (FirstName, LastName, GroupID) combination that occurs at least
-- twice with a non-null GroupID, together with how often it occurs.
-- Only grouped columns and aggregates are selected: SELECT * combined with
-- GROUP BY is rejected by SQL Server because the remaining columns are
-- neither grouped nor aggregated.
SELECT
    FirstName,
    LastName,
    GroupID,
    COUNT(GroupID) AS MemberCount
FROM table_name
GROUP BY
    FirstName,
    LastName,
    GroupID
HAVING COUNT(GroupID) >= 2
ORDER BY GroupID
-1
source

All Articles