Group All Related Records in Many to Many Relationship, SQL graph connected components

I thought about using recursive CTE, but, as far as I know, it’s not possible in SQL Server to use UNION to connect anchor member and a recursive member of recursive CTE (I think it’s possible to do in PostgreSQL), so it’s not possible to eliminate duplicates.

declare @i int

with cte as (
     select
         GroupID,
         row_number() over(order by Company) as rn
     from Table1
)
update cte set GroupID = rn

select @i = @@rowcount

-- while some rows updated
while @i > 0
begin
    update T1 set
        GroupID = T2.GroupID
    from Table1 as T1
        inner join (
            select T2.Company, min(T2.GroupID) as GroupID
            from Table1 as T2
            group by T2.Company
        ) as T2 on T2.Company = T1.Company
    where T1.GroupID > T2.GroupID

    select @i = @@rowcount

    update T1 set
        GroupID = T2.GroupID
    from Table1 as T1
        inner join (
            select T2.Publisher, min(T2.GroupID) as GroupID
            from Table1 as T2
            group by T2.Publisher
        ) as T2 on T2.Publisher = T1.Publisher
    where T1.GroupID > T2.GroupID

    -- will be > 0 if any rows updated
    select @i = @i + @@rowcount
end

;with cte as (
     select
         GroupID,
         dense_rank() over(order by GroupID) as rn
     from Table1
)
update cte set GroupID = rn

sql fiddle demo

I’ve also tried a breadth first search algorithm. I thought it could be faster (it’s better in terms of complexity), so I’ll provide a solution here. I’ve found that it’s not faster than SQL approach, though:

declare @Company nvarchar(2), @Publisher nvarchar(2), @GroupID int

declare @Queue table (
    Company nvarchar(2), Publisher nvarchar(2), ID int identity(1, 1),
    primary key(Company, Publisher)
)

select @GroupID = 0

while 1 = 1
begin
    select top 1 @Company = Company, @Publisher = Publisher
    from Table1
    where GroupID is null

    if @@rowcount = 0 break

    select @GroupID = @GroupID + 1

    insert into @Queue(Company, Publisher)
    select @Company, @Publisher

    while 1 = 1
    begin
        select top 1 @Company = Company, @Publisher = Publisher
        from @Queue
        order by ID asc

        if @@rowcount = 0 break

        update Table1 set
            GroupID = @GroupID
        where Company = @Company and Publisher = @Publisher

        delete from @Queue where Company = @Company and Publisher = @Publisher

        ;with cte as (
            select Company, Publisher from Table1 where Company = @Company and GroupID is null
            union all
            select Company, Publisher from Table1 where Publisher = @Publisher and GroupID is null
        )
        insert into @Queue(Company, Publisher)
        select distinct c.Company, c.Publisher
        from cte as c
        where not exists (select * from @Queue as q where q.Company = c.Company and q.Publisher = c.Publisher)
   end
end

sql fiddle demo

I’ve tested my version and Gordon Linoff’s to check how it’s perform. It looks like CTE is much worse, I couldn’t wait while it’s complete on more than 1000 rows.

Here’s sql fiddle demo with random data. My results were:

128 rows:

my RBAR solution: 190ms

my SQL solution: 27ms

Gordon Linoff’s solution: 958ms

256 rows:

my RBAR solution: 560ms

my SQL solution: 1226ms

Gordon Linoff’s solution: 45371ms

It’s random data, so results may be not very consistent. I think timing could be changed by indexes, but don’t think it could change a whole picture.

old version – using temporary table, just calculating GroupID without touching initial table:

declare @i int

-- creating table to gather all possible GroupID for each row
create table #Temp
(
    Company varchar(1), Publisher varchar(1), GroupID varchar(1),
    primary key (Company, Publisher, GroupID)
)

-- initializing it with data
insert into #Temp (Company, Publisher, GroupID)
select Company, Publisher, Company
from Table1

select @i = @@rowcount

-- while some rows inserted into #Temp
while @i > 0
begin
    -- expand #Temp in both directions
    ;with cte as (
        select
            T2.Company, T1.Publisher,
            T1.GroupID as GroupID1, T2.GroupID as GroupID2
        from #Temp as T1
            inner join #Temp as T2 on T2.Company = T1.Company
        union
        select
            T1.Company, T2.Publisher,
            T1.GroupID as GroupID1, T2.GroupID as GroupID2
        from #Temp as T1
            inner join #Temp as T2 on T2.Publisher = T1.Publisher        
    ), cte2 as (
        select
            Company, Publisher,
            case when GroupID1 < GroupID2 then GroupID1 else GroupID2 end as GroupID
        from cte
    )
    insert into #Temp
    select Company, Publisher, GroupID
    from cte2
    -- don't insert duplicates
    except
    select Company, Publisher, GroupID
    from #Temp

    -- will be > 0 if any row inserted
    select @i = @@rowcount
end

select
    Company, Publisher,
    dense_rank() over(order by min(GroupID)) as GroupID
from #Temp
group by Company, Publisher

=> sql fiddle example

Leave a Comment