R: получить последовательность различных состояний без подряд идущих повторов


Я хотел бы получить различные состояния последовательности, убрав подряд идущие повторения, но сохранив при этом порядок. Позвольте мне объяснить на примере.

Мои 2 последовательности выглядят так:

library(reshape2) 
library(dplyr)    

# Reshape to long format (one row per id and time point), sorted by id.
dt %>%
  melt(id.vars = "id") %>%
  arrange(id)

   id variable          value
1   1       t1       Domestic
2   1       t2       Domestic
3   1       t3       Domestic
4   1       t4       Domestic
5   1       t5       Domestic
6   1       t6            Eat
7   1       t7            Eat
8   1       t8            Eat
9   1       t9            Eat
10  1      t10            Eat
11  1      t11       Domestic

12  2       t1 SocialContacts
13  2       t2         Travel
14  2       t3         Travel
15  2       t4       Domestic
16  2       t5         Travel
17  2       t6         Travel
18  2       t7       Domestic
19  2       t8       Domestic
20  2       t9       Domestic
21  2      t10         Travel
22  2      t11         Travel

Вот что я хотел бы получить (желаемый вывод):

     id             value
    (int)           (chr)
      1             Domestic
      1             Eat
      1             Domestic

      2             SocialContacts
      2             Travel
      2             Domestic
      2             Travel
      2             Domestic
      2             Travel

Пока я добился только этого:

# Attempt so far: grouping by (id, value) before distinct() leaves a single
# row per pair (see the output below), so recurring states are lost.
dt %>%
  melt(id.vars = "id") %>%
  group_by(id, value) %>%
  arrange(id) %>%
  distinct()

     id variable          value
   (int)   (fctr)          (chr)
1     1       t1       Domestic
2     1       t6            Eat
3     2       t4       Domestic
4     2       t1 SocialContacts
5     2       t2         Travel

Есть идеи?

# Sample data in wide format: one row per id, one column per time point
# (t1..t11) holding the state observed at that time.
dt <- structure(
  list(
    t1 = c("Domestic", "SocialContacts"),
    t2 = c("Domestic", "Travel"),
    t3 = c("Domestic", "Travel"),
    t4 = c("Domestic", "Domestic"),
    t5 = c("Domestic", "Travel"),
    t6 = c("Eat", "Travel"),
    t7 = c("Eat", "Domestic"),
    t8 = c("Eat", "Domestic"),
    t9 = c("Eat", "Domestic"),
    t10 = c("Eat", "Travel"),
    t11 = c("Domestic", "Travel"),
    id = 1:2
  ),
  row.names = 1:2,
  class = "data.frame"
)
3 2

3 ответа:

Альтернатива ответу @Psidom с использованием dplyr:

# Sample data in long format: 11 time points (t1..t11) for each of two ids,
# with the state observed at each time point in `value`.
input <- structure(
  list(
    id = rep(1:2, each = 11L),
    variable = rep(paste0("t", 1:11), times = 2L),
    value = c(
      # id 1
      "Domestic", "Domestic", "Domestic", "Domestic", "Domestic",
      "Eat", "Eat", "Eat", "Eat", "Eat", "Domestic",
      # id 2
      "SocialContacts", "Travel", "Travel", "Domestic", "Travel",
      "Travel", "Domestic", "Domestic", "Domestic", "Travel", "Travel"
    )
  ),
  class = "data.frame",
  row.names = as.character(1:22)
)

Код:

library(dplyr)
# Keep the first row of every run of identical consecutive values.
# A new run starts whenever `value` OR `id` changes: comparing `value` alone
# would merge the end of one id with the start of the next one when both
# happen to carry the same state.
input %>%
  mutate(
    grp = cumsum(
      value != lag(value, default = value[1]) |
        id != lag(id, default = id[1])
    )
  ) %>%
  group_by(grp) %>%
  slice(1)
# Source: local data frame [9 x 4]
# Groups: grp [9]
#      id variable          value   grp
#   <int>    <chr>          <chr> <int>
# 1     1       t1       Domestic     0
# 2     1       t6            Eat     1
# 3     1      t11       Domestic     2
# 4     2       t1 SocialContacts     3
# 5     2       t2         Travel     4
# 6     2       t4       Domestic     5
# 7     2       t5         Travel     6
# 8     2       t7       Domestic     7
# 9     2      t10         Travel     8

Вот решение data.table:

library(data.table)    
# Convert dt to a data.table in place.
setDT(dt)
# get secondary id with rleid; passing id as well starts a new run at every
# id boundary, so a state shared by the end of one id and the start of the
# next is not collapsed into a single row
dt[, id2 := rleid(id, value)]
# subset to first rows in secondary id, then drop the helper column from the
# result and print it
dt[dt[, .I[1L], by = "id2"]$V1, ][, id2 := NULL][]

Что выводит:

   id variable          value
1:  1       t1       Domestic
2:  1       t6            Eat
3:  1      t11       Domestic
4:  2       t1 SocialContacts
5:  2       t2         Travel
6:  2       t4       Domestic
7:  2       t5         Travel
8:  2       t7       Domestic
9:  2      t10         Travel

Данные

# Long-format sample data; the unnamed first field of each row is used by
# read.table() as the row names. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
dt <- read.table(header = TRUE, text = "   id variable          value
1   1       t1       Domestic
2   1       t2       Domestic
3   1       t3       Domestic
4   1       t4       Domestic
5   1       t5       Domestic
6   1       t6            Eat
7   1       t7            Eat
8   1       t8            Eat
9   1       t9            Eat
10  1      t10            Eat
11  1      t11       Domestic
12  2       t1 SocialContacts
13  2       t2         Travel
14  2       t3         Travel
15  2       t4       Domestic
16  2       t5         Travel
17  2       t6         Travel
18  2       t7       Domestic
19  2       t8       Domestic
20  2       t9       Domestic
21  2      t10         Travel
22  2      t11         Travel")

Использовать rleid из data.table:

library(data.table)
library(dplyr)
# Long format sorted by id; rows are grouped by id together with the run id
# of `value`, each group is collapsed to its single distinct value, and the
# helper run id is dropped at the end.
dt %>%
  melt(id.vars = "id") %>%
  arrange(id) %>%
  group_by(id, rleid = rleid(value)) %>%
  summarise(value = unique(value)) %>%
  select(-rleid)

#    id          value
# 1   1       Domestic
# 6   1            Eat
# 11  1       Domestic
# 12  2 SocialContacts
# 13  2         Travel
# 15  2       Domestic
# 16  2         Travel
# 18  2       Domestic
# 21  2         Travel

Аналогичный подход с использованием data.table:

library(data.table)
# Melt to long format, order by id, group by id together with the run id of
# `value`, de-duplicate the (id, rleid, value) rows, then drop the run id.
# NOTE: setDT() converts dt to a data.table by reference.
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
unique(
  melt(setDT(dt), id.vars = "id")[order(id)][, .(value), .(id, rleid(value))]
)[, -"rleid", with = FALSE]

#    id          value
# 1:  1       Domestic
# 2:  1            Eat
# 3:  1       Domestic
# 4:  2 SocialContacts
# 5:  2         Travel
# 6:  2       Domestic
# 7:  2         Travel
# 8:  2       Domestic
# 9:  2         Travel