How do I merge similar records with different validity dates?


10

The table I am working with has three components:

  1. An ID column (a primary key in another table)
  2. Some data columns
  3. Valid from/to date columns.

Values:

ID   Data From        To  
1    a    2015-01-01  2015-01-05
1    a    2015-01-06  2015-01-10
1    b    2015-01-11  2015-01-15
1    a    2015-01-16  2015-01-20
2    c    2015-01-01  2015-01-05
2    c    2015-01-06  2015-01-10

The table is updated by taking "snapshots" of another data source at certain intervals and assigning validity dates to the records. The problem is that these snapshots create duplicate entries for records (with different validity dates) that were not modified at all during that interval.

I want to reduce the size of the table by finding rows with consecutive dates, merging them, and assigning them a single validity period. For example:

ID   Data From        To  
1    a    2015-01-01  2015-01-10
1    b    2015-01-11  2015-01-15
1    a    2015-01-16  2015-01-20
2    c    2015-01-01  2015-01-10

The logic I currently have is:

  1. Select and sort all rows by ID, data fields, and the 'valid from' field (so that they end up in groups of consecutive rows).
  2. Use a cursor to compare adjacent rows for similarity.
  3. If they are identical, merge the rows and change the validity period to cover both rows.

I understand that cursors are very inefficient (I have a large data set), so I am looking for other approaches.
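
For reference, here is a minimal sketch of that cursor logic. The names and details are illustrative only: it assumes the table is called mergeTest, that the data columns reduce to a single char(1) column, that ([id], [from]) identifies a row, and that "consecutive" means the next [from] falls exactly one day after the previous [to], as in the sample above:

DECLARE @id int, @data char(1), @from date, @to date;
DECLARE @prevId int, @prevData char(1), @prevFrom date, @prevTo date;

DECLARE merge_cur CURSOR LOCAL STATIC FOR       -- STATIC: iterate a snapshot while the base table is modified
    SELECT [id], [data], [from], [to]
    FROM mergeTest
    ORDER BY [id], [data], [from];              -- step 1: group candidate rows together

OPEN merge_cur;
FETCH NEXT FROM merge_cur INTO @id, @data, @from, @to;
WHILE @@FETCH_STATUS = 0
BEGIN
    IF @id = @prevId AND @data = @prevData
       AND @from = DATEADD(DAY, 1, @prevTo)     -- step 2: adjacent row with identical data
    BEGIN
        UPDATE mergeTest SET [to] = @to         -- step 3: extend the surviving row...
            WHERE [id] = @id AND [from] = @prevFrom;
        DELETE FROM mergeTest                   -- ...and remove the absorbed row
            WHERE [id] = @id AND [from] = @from;
        SET @prevTo = @to;                      -- keep comparing against the extended row
    END
    ELSE
        SELECT @prevId = @id, @prevData = @data, @prevFrom = @from, @prevTo = @to;
    FETCH NEXT FROM merge_cur INTO @id, @data, @from, @to;
END
CLOSE merge_cur;
DEALLOCATE merge_cur;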


2
Also: add the CREATE TABLE statement to the question.
ypercubeᵀᴹ

2
How big is "large" for this data set? Why not fix the snapshot import so that it doesn't create the problem in the first place?
Paul White

On the order of millions of records. I don't have the permissions to change how the table is created. Also, that wouldn't fix the problem with the existing records.
Hazrmard

Answers:


8

If this is just a table of back-to-back intervals, your case can be treated as a classic "gaps and islands" problem, where you only need to isolate islands of consecutive intervals and then "condense" them by taking the minimum [from] and the maximum [to] per island.

There is an established method of solving this using two ROW_NUMBER calls:

WITH islands AS
(
  SELECT
    id,
    data,
    [from],
    [to],
    -- the difference of the two row numbers is constant within each run
    -- of consecutive rows that share the same data
    island = ROW_NUMBER() OVER (PARTITION BY id       ORDER BY [from])
           - ROW_NUMBER() OVER (PARTITION BY id, data ORDER BY [from])
  FROM
    #mergeTest
)
SELECT
  id,
  data,
  [from] = MIN([from]),  -- condense each island into a single validity period
  [to]   = MAX([to])
FROM
  islands
GROUP BY
  id,
  data,
  island
;

This query will work on versions as far back as SQL Server 2005.
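
To see why this works, here is the intermediate result the islands CTE produces for the sample data in the question (the two ROW_NUMBER values are shown purely for illustration). Their difference is constant within every run of consecutive rows carrying the same data, and changes whenever the data changes:

id   data  from        to          rn(id)  rn(id, data)  island
1    a     2015-01-01  2015-01-05  1       1             0
1    a     2015-01-06  2015-01-10  2       2             0
1    b     2015-01-11  2015-01-15  3       1             2
1    a     2015-01-16  2015-01-20  4       3             1
2    c     2015-01-01  2015-01-05  1       1             0
2    c     2015-01-06  2015-01-10  2       2             0

Grouping on (id, data, island) and taking MIN([from]) and MAX([to]) then yields exactly the four condensed rows requested in the question.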


1

I was able to write a query that solves this problem. It uses multiple joins and a while loop to merge the records. This code is compatible with SQL Server 2008 R2.

CREATE TABLE #mergeTest
(
    [id] int NOT NULL,
    [data] date,
    [from] date NOT NULL,
    [to] date NOT NULL
);

INSERT INTO #mergeTest ([id],[data],[from],[to]) VALUES     --testing NULL data value handling
    (1,NULL,'2015-01-01','2015-01-05'), --1
    (1,NULL,'2015-01-05','2015-01-10'), --2
    (1,'2000-01-01','2015-01-10','2015-01-14'), --3
    (1,'2000-01-03','2015-01-14','2015-01-15'), --4
    (1,'2000-01-01','2015-01-15','2015-01-20'), --5
    (1,'2000-01-01','2015-01-20','2015-01-22'), --6
    (1,'2000-01-01','2015-01-22','2015-01-25'), --7
    (1,'2000-01-01','2015-01-25','2015-01-30'), --8
    (1,NULL,'2015-01-30','2015-02-04'), --9
    (2,'2000-01-05','2015-01-01','2015-01-05'), --10
    (2,'2000-01-05','2015-01-05','2015-01-10')  --11

SELECT * FROM #mergeTest 
GO

SELECT * INTO #tempSingle                               --isolate single records. Single records need no processing.
    FROM (
        SELECT  [id], [data], MIN([from]) as [from], MIN([to]) as [to],
                COUNT([id]) as [grpsz]
        FROM #mergeTest
        GROUP BY [id], [data]) AS [selection]
    WHERE [grpsz]=1;
ALTER TABLE #tempSingle
    DROP COLUMN [grpsz];
GO

SELECT * INTO #tempRemainingTemp                        --isolate records w/ 2 or more entries. They need to be reduced to single records
    FROM (
        SELECT  [id], [data],                           --get [id] and [data] of duplicate records
                COUNT([id]) as [grpsz]
        FROM #mergeTest
        GROUP BY [id], [data]) AS [selection]
    WHERE [grpsz]>=2;
ALTER TABLE #tempRemainingTemp
    DROP COLUMN [grpsz]
SELECT * FROM #tempRemainingTemp
SELECT * INTO #temp                                     --get all duplicate records into #temp
    FROM (
        SELECT [b].*
        FROM #tempRemainingTemp AS [a]
        JOIN #mergeTest AS [b]
        ON      [a].[id]=[b].[id]
            AND ([a].[data]=[b].[data] OR [a].[data] IS NULL AND [b].[data] IS NULL)) AS [selection];

DROP TABLE #tempRemainingTemp;
GO
SELECT * INTO #tempRemaining
    FROM #temp;
DROP TABLE #temp;
GO
SELECT * FROM #tempRemaining
BEGIN
SELECT t1.*, t2.[from] as [prevfrom] INTO #temp0        --filter in records where previous 'to' date matched current 'from' date when grouped by id and data
    FROM #tempRemaining AS t1
    JOIN #tempRemaining AS t2
    ON      t2.[to] = t1.[from]
        AND t1.[id] = t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)

SELECT t1.*, t2.[prevfrom] INTO #temp1                  --add records that did not have a previous 'to' date b/c they were the extreme records in their group
    FROM #tempRemaining AS t1
    LEFT JOIN #temp0 AS t2
    ON      t1.[id]=t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
        AND t1.[from] = t2.[from];

DROP TABLE #temp0;

SELECT t1.*, t2.[to] as [nextto] INTO #temp2            --filter in records where current 'to' date matched next 'from' date when grouped by id and data
    FROM #temp1 AS t1
    JOIN #temp1 AS t2
    ON      t2.[from] = t1.[to]
        AND t1.[id] = t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL);

SELECT t1.*, t2.[nextto] INTO #temp                     --add records that did not have a next 'from' date b/c they were the extreme records in their group
    FROM #temp1 AS t1
    LEFT JOIN #temp2 AS t2
    ON      t1.[id]=t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
        AND t1.[from] = t2.[from];

DROP TABLE #temp2;
DROP TABLE #temp1;

DELETE FROM #temp                                       --delete redundant records
    WHERE   [prevfrom] IS NOT NULL
        AND [nextto] IS NOT NULL;

WITH cte AS (                                           --select records that got reduced to singles and insert them into the singles table
    SELECT [id], [data], [from], [to]
        FROM [#temp]
        WHERE   [prevfrom] IS NULL
            AND [nextto] IS NULL)
DELETE FROM cte
OUTPUT deleted.* INTO #tempSingle

/* ALL DUPLICATE RECORDS ARE NOW REDUCED TO PAIRS */

SELECT * FROM #temp;
ALTER TABLE #temp
    DROP COLUMN [nextto],[prevfrom]                     --remove helper columns
END

SELECT TOP 1 * INTO #temptemp                           --create temporary tables for storage
    FROM #temp
SELECT TOP 1 * INTO #tempResult
    FROM #temp
TRUNCATE TABLE #temptemp
TRUNCATE TABLE #tempResult

WHILE EXISTS(SELECT [id] from #temp)
BEGIN
    WITH cte AS (
            SELECT TOP 2 *                              --select pair
                FROM #temp
                ORDER BY [id],[data],[from])
        DELETE FROM cte                                 --delete from original table
        OUTPUT deleted.* INTO #temptemp;
    INSERT INTO #tempResult                             --insert merged record into result table
        SELECT t1.[id], t1.[data], t1.[from], t2.[to]
        FROM #temptemp AS t1
        JOIN #temptemp AS t2
        ON t1.[from]<t2.[from];
    TRUNCATE TABLE #temptemp;                           --empty temporary storage table
END;

TRUNCATE TABLE #mergeTest;                              --insert single records and merged records into original table
INSERT INTO #mergeTest
    SELECT * FROM #tempResult;
INSERT INTO #mergeTest
    SELECT * FROM #tempSingle;

SELECT * FROM #mergeTest
    ORDER BY [id],[from];

0

Just in case you have non-contiguous date ranges that, although consecutive, must remain separate, I came up with this solution:

See it on SQL Fiddle

WITH lag_info AS (
  SELECT
    ID,
    Data,
    [From],
    [To],
    lag([To], 1, NULL) OVER (PARTITION BY ID ORDER BY [From]) AS PrevTo,
    lag(Data, 1, NULL) OVER (PARTITION BY ID ORDER BY [From]) AS PrevData
  FROM dat
),
segmented AS (
  SELECT
    ID,
    Data,
    [From],
    [To],
    -- new interval if non-contiguous or the data changed
    -- a NULL means this is the first entry for the ID, which also makes it a new interval
    CASE
      WHEN [PrevTo] IS NULL
        OR PrevData IS NULL
        OR DATEDIFF(DAY, [PrevTo], [From]) > 1
        OR Data <> PrevData
      THEN 1
      ELSE 0
    END AS is_new_interval
  FROM lag_info
),
segmented_marked AS (
  SELECT
    ID,
    [From],
    [To],
    Data,
    -- increment only when new data is detected, using a running sum
    sum(s.is_new_interval)
      OVER (PARTITION BY ID ORDER BY [From] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
                                AS interval_id
  FROM segmented s
)
SELECT
  ID,
  min([From]) AS [From],
  max([To]) AS [To],
  Data
FROM segmented_marked
GROUP BY ID, Data, interval_id
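
For example (values hypothetical), the rows (1, 'a', '2015-01-01', '2015-01-05') and (1, 'a', '2015-01-07', '2015-01-10') stay separate: DATEDIFF(DAY, '2015-01-05', '2015-01-07') = 2, so the second row starts a new interval even though Data is unchanged. With a next [From] of '2015-01-06' the difference would be 1 and the two rows would be condensed into one.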

-1

I wrote a query that seems to work. It uses common table expressions, MERGE statements, and analytic functions. However, it is only compatible with SQL Server 2012+. You can find the gist here: MergeRecordsByValidityDate.sql

/*  NOTE: Only works w/ SQL Server 2012+
    Merging identical records with different validity dates.
*/
USE [master]


IF OBJECT_ID('mergeTest') IS NOT NULL
    DROP TABLE mergeTest

CREATE TABLE mergeTest          -- Create table with test data
(
    [id] int NOT NULL,
    [data] char(1) NOT NULL,
    [from] date NOT NULL,
    [to] date NOT NULL
);

INSERT INTO mergeTest ([id],[data],[from],[to]) VALUES      -- Insert records w/ different validity dates
    (1,'a','2015-01-01','2015-01-05'),  --1
    (1,'a','2015-01-05','2015-01-10'),  --2
    (1,'a','2015-01-10','2015-01-14'),  --3
    (1,'b','2015-01-14','2015-01-15'),  --4
    (1,'a','2015-01-15','2015-01-20'),  --5
    (1,'a','2015-01-20','2015-01-25'),  --6
    (1,'a','2015-01-25','2015-01-30'),  --7
    (1,'a','2015-01-30','2015-02-04'),  --8
    (2,'c','2015-01-01','2015-01-05'),  --9
    (2,'c','2015-01-05','2015-01-10')   --10

SELECT * FROM mergeTest

/*  This SELECT statement uses a Common Table Expression along with analytic functions over a partition.
    The data set is partitioned on matching primary key and data columns and ordered by 'from' dates.
    'last' and 'next' columns are added holding the 'to' date of the previous row and the 'from' date
    of the next row. For each partition, only the rows that represent the first and last records of a
    run of identical data are kept. E.g. rows 5,6,7,8 are reduced to 5,8.
*/

;WITH partitionedData AS (
    SELECT *,   LAG([to],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [last],
                LEAD([from],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [next]
    FROM mergeTest)
SELECT [id],[data],[from],[to],[last],[next] INTO #temp
    FROM partitionedData
    WHERE [last] IS NULL OR [next] IS NULL OR [last]<>[from] OR [next]<>[to]
;

SELECT * FROM #temp

/*  Now all redundant 'sandwiched' records have been filtered out; only the extreme records are left.
    This MERGE statement matches rows on primary key and data, and if the 'to' date of a record matches
    the 'from' date of another similar record, that record is extended to encapsulate the other record's
    'to' date. For example, row 5's 'to' date is extended to equal row 8's 'to' date.
*/

MERGE INTO #temp as m1
    USING #temp as m2
    ON m1.id=m2.id AND m1.data=m2.data
WHEN MATCHED
    AND (m1.[to]=m2.[from])
    THEN
    UPDATE SET  m1.[to]=m2.[to]
;

SELECT * FROM #temp

/*  The MERGE statement has done its job of extending records. However, there are still pairs of records
    with identical data. E.g. rows 9,10 both still exist even though row 9 now carries all the required
    information. This block recomputes the 'last' and 'next' columns so that the 'last' and 'from' values
    of such redundant rows go out of sync.
*/

;WITH repartitionedData AS (
    SELECT [id],[data],[from],[to], LAG([to],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [last],
                LEAD([from],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [next]
    FROM #temp)
SELECT [id],[data],[from],[to],[last],[next] INTO #temptemp
    FROM repartitionedData
    WHERE [last] IS NULL OR [next] IS NULL OR [last]<>[from] OR [next]<>[to]
;

SELECT * FROM #temptemp

/* Rows whose 'from' is out of sync (earlier than the recomputed 'last') are deleted
*/

DELETE FROM #temptemp
    WHERE [from]<[last]

SELECT * FROM #temptemp

/*  However, blocks of data with >2 rows (like rows 5 through 8) could not be merged because of the filtered out
    rows (i.e. rows 6,7). Applying MERGE again on the updated data set.
*/

MERGE INTO #temptemp as m1
    USING #temptemp as m2
    ON m1.id=m2.id AND m1.data=m2.data
WHEN MATCHED
    AND (m1.[from]=m2.[next])
    THEN
    UPDATE SET  m1.[from]=m2.[from],
                m1.[last]=CASE WHEN ((m2.[last] IS NULL) OR (m2.[next] IS NULL)) THEN NULL ELSE m1.[last] END   --if row absorbing from is extreme, then current row is also extreme
;

SELECT * FROM #temptemp

TRUNCATE TABLE mergeTest        -- resetting original table

/*  The MERGE has given all rows the correct 'from' and 'to' dates. The only rows we are interested in
    are the extreme rows, i.e. those with 'last' or 'next' == NULL. SELECTing on that criterion and
    INSERTing into the original table.
*/

INSERT INTO mergeTest           -- inserting processed records into table + some last minute filtering
    SELECT [id],[data],[from],MAX([to])
    FROM #temptemp
        WHERE [next] IS NULL OR [last] IS NULL
    GROUP BY [id],[data],[from]

SELECT * FROM mergeTest

DROP TABLE #temp
DROP TABLE #temptemp