Conteggio e somma delle sequenze numeriche positive e negative

31

Voglio scrivere un codice per contare e sommare qualsiasi serie di numeri positivi e negativi.
I numeri sono positivi o negativi (nessuno zero).
Ho scritto codici con forloop. C'è qualche alternativa creativa?

Dati

R

set.seed(100)
x <- round(rnorm(20, sd = 0.02), 3)

pitone

x = [-0.01, 0.003, -0.002, 0.018, 0.002, 0.006, -0.012, 0.014, -0.017, -0.007,

     0.002, 0.002, -0.004, 0.015, 0.002, -0.001, -0.008, 0.01, -0.018, 0.046]

loop

R

sign_indicator <- ifelse(x > 0, 1,-1)
number_of_sequence <- rep(NA, 20)
n <- 1
for (i in 2:20) {
  if (sign_indicator[i] == sign_indicator[i - 1]) {
    n <- n + 1
  } else{
    n <- 1
  }
  number_of_sequence[i] <- n

}
number_of_sequence[1] <- 1

#############################

summation <- rep(NA, 20)

for (i in 1:20) {
  summation[i] <- sum(x[i:(i + 1 - number_of_sequence[i])])
}

pitone

sign_indicator = [1 if i > 0 else -1 for i in X]

number_of_sequence = [1]
N = 1
for i in range(1, len(sign_indicator)):
    if sign_indicator[i] == sign_indicator[i - 1]:
        N += 1
    else:
        N = 1
    number_of_sequence.append(N)

#############################
summation = []

for i in range(len(X)):
    if number_of_sequence[i] == 1:          
          summation.append(X[i])

    else:
        summation.append(sum(X[(i + 1 - number_of_sequence[i]):(i + 1)]))

risultato

        x n_of_sequence    sum
1  -0.010             1 -0.010
2   0.003             1  0.003
3  -0.002             1 -0.002
4   0.018             1  0.018
5   0.002             2  0.020
6   0.006             3  0.026
7  -0.012             1 -0.012
8   0.014             1  0.014
9  -0.017             1 -0.017
10 -0.007             2 -0.024
11  0.002             1  0.002
12  0.002             2  0.004
13 -0.004             1 -0.004
14  0.015             1  0.015
15  0.002             2  0.017
16 -0.001             1 -0.001
17 -0.008             2 -0.009
18  0.010             1  0.010
19 -0.018             1 -0.018
20  0.046             1  0.046

python r

— Iman
fonte

17

Le altre soluzioni sembrano a posto ma non è necessario utilizzare sofisticate funzionalità linguistiche o funzioni di libreria per questo semplice problema.

result, prev = [], None

for idx, cur in enumerate(x):
    if not prev or (prev > 0) != (cur > 0):
        n, summation = 1, cur
    else:
        n, summation = n + 1, summation + cur
    result.append((idx, cur, n, summation))
    prev = cur

Come puoi vedere, non hai davvero bisogno di un sign_indicatorelenco, due for-loop o rangefunzioni come nello snippet nella sezione delle domande.

Se vuoi che l'indice inizi da 1, usa enumerate(x, 1)invece dienumerate(x)

Per vedere il risultato, è possibile eseguire il codice seguente

for idx, num, length, summation in result:
     print(f"{idx: >2d} {num: .3f} {length: >2d} {summation: .3f}")

— bombe
fonte

14

In R, puoi usare data.tables rleidper creare gruppi con serie di numeri positivi e negativi, quindi creare una sequenza di righe in ciascun gruppo ed eseguire una somma cumulativa dei xvalori.

library(data.table)
df <- data.table(x)
df[, c("n_of_sequence", "sum") := list(seq_len(.N), cumsum(x)), by = rleid(sign(x))]
df

#         x n_of_sequence    sum
# 1: -0.010             1 -0.010
# 2:  0.003             1  0.003
# 3: -0.002             1 -0.002
# 4:  0.018             1  0.018
# 5:  0.002             2  0.020
# 6:  0.006             3  0.026
# 7: -0.012             1 -0.012
# 8:  0.014             1  0.014
# 9: -0.017             1 -0.017
#10: -0.007             2 -0.024
#11:  0.002             1  0.002
#12:  0.002             2  0.004
#13: -0.004             1 -0.004
#14:  0.015             1  0.015
#15:  0.002             2  0.017
#16: -0.001             1 -0.001
#17: -0.008             2 -0.009
#18:  0.010             1  0.010
#19: -0.018             1 -0.018
#20:  0.046             1  0.046

Possiamo usare rleidin dplyrcosì di creare gruppi e fare lo stesso.

library(dplyr)
df %>%
  group_by(gr = data.table::rleid(sign(x))) %>%
  mutate(n_of_sequence = row_number(), sum = cumsum(x))

— Ronak Shah
fonte

2

n_of_sequencenon è identico a quello desiderato

— Iman,

@Iman Siamo spiacenti, ho letto male l'output in precedenza. L'ho corretto ora.

— Ronak Shah,

10

Puoi calcolare le lunghezze di esecuzione di ciascun segno usando rleda basea e fare qualcosa del genere.

set.seed(0)
z <- round(rnorm(20, sd = 0.02), 3)
run_lengths <- rle(sign(z))$lengths
run_lengths
# [1] 1 1 1 3 1 1 2 2 1 2 2 1 1 1

Ottenere n_of_sequence

n_of_sequence <- run_lengths %>% map(seq) %>% unlist
n_of_sequence
# [1] 1 1 1 1 2 3 1 1 1 2 1 2 1 1 2 1 2 1 1 1

Infine, per ottenere le somme delle sequenze,

start <- cumsum(c(1,run_lengths))
start <- start[-length(start)] # start points of each series 
map2(start,run_lengths,~cumsum(z[.x:(.x+.y-1)])) %>% unlist()
# [1] -0.010  0.003 -0.002  0.018  0.020  0.026 -0.012  0.014 -0.017 -0.024
# [11]  0.002  0.004 -0.004  0.015  0.017 -0.001 -0.009  0.010 -0.018  0.046

— Ameer
fonte

6

Ecco una semplice funzione senza loop in R:

count_and_sum <- function(x)
{
  runs   <- rle((x > 0) * 1)$lengths
  groups <- split(x, rep(1:length(runs), runs))
  output <- function(group) data.frame(x = group, n = seq_along(group), sum = cumsum(group))
  result <- as.data.frame(do.call(rbind, lapply(groups, output)))
  `rownames<-`(result, 1:nrow(result))
}

Quindi puoi fare:

set.seed(100)
x <- round(rnorm(20, sd = 0.02), 3)
count_and_sum(x)
#>         x n    sum
#> 1  -0.010 1 -0.010
#> 2   0.003 1  0.003
#> 3  -0.002 1 -0.002
#> 4   0.018 1  0.018
#> 5   0.002 2  0.020
#> 6   0.006 3  0.026
#> 7  -0.012 1 -0.012
#> 8   0.014 1  0.014
#> 9  -0.017 1 -0.017
#> 10 -0.007 2 -0.024
#> 11  0.002 1  0.002
#> 12  0.002 2  0.004
#> 13 -0.004 1 -0.004
#> 14  0.015 1  0.015
#> 15  0.002 2  0.017
#> 16 -0.001 1 -0.001
#> 17 -0.008 2 -0.009
#> 18  0.010 1  0.010
#> 19 -0.018 1 -0.018
#> 20  0.046 1  0.046

^{Creato il 2020-02-16 dal pacchetto reprex (v0.3.0)}

— Allan Cameron
fonte

5

Ecco una tidyversesoluzione semplice ...

library(tidyverse) #or just dplyr and tidyr

set.seed(100)
x <- round(rnorm(20, sd = 0.02), 3)

df <- tibble(x = x) %>% 
  mutate(seqno = cumsum(c(1, diff(sign(x)) != 0))) %>% #identify sequence ids
  group_by(seqno) %>%                                  #group by sequences
  mutate(n_of_sequence = row_number(),                 #count row numbers for each group
         sum = cumsum(x)) %>%                          #cumulative sum for each group
  ungroup() %>% 
  select(-seqno)                                       #remove sequence id

df
# A tibble: 20 x 3
        x n_of_sequence     sum
    <dbl>         <int>   <dbl>
 1 -0.01              1 -0.01  
 2  0.003             1  0.003 
 3 -0.002             1 -0.002 
 4  0.018             1  0.018 
 5  0.002             2  0.0200
 6  0.006             3  0.026 
 7 -0.012             1 -0.012 
 8  0.014             1  0.014 
 9 -0.017             1 -0.017 
10 -0.007             2 -0.024 
11  0.002             1  0.002 
12  0.002             2  0.004 
13 -0.004             1 -0.004 
14  0.015             1  0.015 
15  0.002             2  0.017 
16 -0.001             1 -0.001 
17 -0.008             2 -0.009 
18  0.01              1  0.01  
19 -0.018             1 -0.018 
20  0.046             1  0.046

— Andrew Gustar
fonte

5

Per quanto riguarda Python, qualcuno troverà una soluzione usando la libreria panda. Nel frattempo, ecco una semplice proposta:

class Combiner:
    def __init__(self):
        self.index = self.seq_index = self.summation = 0

    def combine(self, value):
        self.index += 1
        if value * self.summation <= 0:
            self.seq_index = 1
            self.summation = value
        else:
            self.seq_index += 1
            self.summation += value
        return self.index, value, self.seq_index, self.summation

c = Combiner()
lst = [c.combine(v) for v in x]

for t in lst:
    print(f"{t[0]:3} {t[1]:7.3f} {t[2]:3} {t[3]:7.3f}")

Produzione:

  1  -0.010   1  -0.010
  2   0.003   1   0.003
  3  -0.002   1  -0.002
  4   0.018   1   0.018
  5   0.002   2   0.020
  6   0.006   3   0.026
  7  -0.012   1  -0.012
  8   0.014   1   0.014
  9  -0.017   1  -0.017
 10  -0.007   2  -0.024
 11   0.002   1   0.002
 12   0.002   2   0.004
 13  -0.004   1  -0.004
 14   0.015   1   0.015
 15   0.002   2   0.017
 16  -0.001   1  -0.001
 17  -0.008   2  -0.009
 18   0.010   1   0.010
 19  -0.018   1  -0.018
 20   0.046   1   0.046

Se hai bisogno di elenchi separati, puoi farlo

idxs, vals, seqs, sums = (list(tpl) for tpl in zip(*lst))

o, se gli iteratori sono OK, semplicemente

idxs, vals, seqs, sums = zip(*lst)

(spiegazione qui )

— Walter Tross
fonte

5

Due diverse soluzioni pigre in Python, usando il modulo itertools .

Usare itertools.groupby (e accumulare)

from itertools import accumulate, groupby

result = (
    item
    for _, group in groupby(x, key=lambda n: n < 0)
    for item in enumerate(accumulate(group), 1)
)

Utilizzo di itertools.accumulate con una funzione di accumulazione personalizzata

from itertools import accumulate

def sign_count_sum(count_sum, value):
    count, prev_sum = count_sum
    same_sign = (prev_sum < 0) is (value < 0)
    if same_sign:
        return count + 1, prev_sum + value
    else:
        return 1, value

result = accumulate(x, sign_count_sum, initial=(0, 0))
next(result)  # needed to skip the initial (0, 0) item

L' initialargomento della parola chiave è stato aggiunto in Python 3.8. Nelle versioni precedenti puoi usare itertools.chainper anteporre la (0,0) -tupla:

result = accumulate(chain([(0, 0)], x), sign_count_sum)

L'output è come previsto:

for (i, v), (c, s) in zip(enumerate(x), result):
    print(f"{i:3} {v:7.3f} {c:3} {s:7.3f}")

  0  -0.010   1  -0.010
  1   0.003   1   0.003
  2  -0.002   1  -0.002
  3   0.018   1   0.018
  4   0.002   2   0.020
  5   0.006   3   0.026
  6  -0.012   1  -0.012
  7   0.014   1   0.014
  8  -0.017   1  -0.017
  9  -0.007   2  -0.024
 10   0.002   1   0.002
 11   0.002   2   0.004
 12  -0.004   1  -0.004
 13   0.015   1   0.015
 14   0.002   2   0.017
 15  -0.001   1  -0.001
 16  -0.008   2  -0.009
 17   0.010   1   0.010
 18  -0.018   1  -0.018
 19   0.046   1   0.046

— Schot
fonte

5

Consiglio questo runner R per questo tipo di operazioni. streak_run calcola l'occorrenza consecutiva dello stesso valore e sum_run calcola la somma nella finestra quale lunghezza è definita kdall'argomento.

Ecco la soluzione:

set.seed(100)
x <- round(rnorm(20, sd = 0.02), 3)

n_of_sequence <- runner::streak_run(x > 0)
sum <- runner::sum_run(x, k = n_of_sequence)

data.frame(x, n_of_sequence, sum)

#         x n_of_sequence    sum
# 1  -0.010             1 -0.010
# 2   0.003             1  0.003
# 3  -0.002             1 -0.002
# 4   0.018             1  0.018
# 5   0.002             2  0.020
# 6   0.006             3  0.026
# 7  -0.012             1 -0.012
# 8   0.014             1  0.014
# 9  -0.017             1 -0.017
# 10 -0.007             2 -0.024
# 11  0.002             1  0.002
# 12  0.002             2  0.004
# 13 -0.004             1 -0.004
# 14  0.015             1  0.015
# 15  0.002             2  0.017
# 16 -0.001             1 -0.001
# 17 -0.008             2 -0.009
# 18  0.010             1  0.010
# 19 -0.018             1 -0.018
# 20  0.046             1  0.046

Sotto il benchmark per confrontare le soluzioni attuali

set.seed(0)
x <- round(rnorm(10000, sd = 0.02), 3)

library(runner)
runner_streak <- function(x) {
  n_of_sequence <- streak_run(x > 0)
  sum <- sum_run(x, k = n_of_sequence)
}

library(data.table)
dt <- data.table(x)
dt_streak <- function(dt) {
  dt[, c("n_of_sequence", "sum") := list(seq_len(.N), cumsum(x)),rleid(sign(x))]
}

rle_streak <- function(x) {
  run_lengths <- rle(sign(x))$lengths
  run_lengths

  n_of_sequence <- run_lengths %>% map(seq) %>% unlist

  start <- cumsum(c(1,run_lengths))
  start <- start[-length(start)]
  sum <- map2(start,run_lengths,~cumsum(x[.x:(.x+.y-1)])) %>% unlist()
}

library(tidyverse)
df <- tibble(x = x)
tv_streak <- function(x) {
  res <- df %>%
    mutate(seqno = cumsum(c(1, diff(sign(x)) != 0))) %>%
    group_by(seqno) %>%
    mutate(n_of_sequence = row_number(),
           sum = cumsum(x)) %>%
    ungroup() %>% 
    select(-seqno)  
}

count_and_sum <- function(x) {
  runs   <- rle((x > 0) * 1)$lengths
  groups <- split(x, rep(1:length(runs), runs))
  output <- function(group) 
    data.frame(x = group, n = seq_along(group), sum = cumsum(group))
  result <- as.data.frame(do.call(rbind, lapply(groups, output)))
  `rownames<-`(result, 1:nrow(result))
}

microbenchmark::microbenchmark(
  runner_streak(x),
  dt_streak(dt),
  rle_streak(x),
  tv_streak(df),
  count_and_sum(x),
  times = 100L
)


# Unit: milliseconds
#             expr         min          lq        mean      median          uq        max neval
# runner_streak(x)    4.240192    4.833563    6.321697    5.300817    6.543926   14.80221   100
#    dt_streak(dt)    7.648100    8.587887   10.862806    9.650483   11.295488   34.66027   100
#    rle_streak(x)   42.321506   55.397586   64.195692   63.404403   67.813738  167.71444   100
#    tv_streak(df)   31.398885   36.333751   45.141452   40.800077   45.756279  163.19535   100
# count_and_sum(x) 1691.438977 1919.518282 2306.036783 2149.543281 2499.951020 6158.43384   100

— GoGonzo
fonte

1

misurare in microsecondi non ha molto senso. Alcune funzioni hanno un sovraccarico iniziale in microsecondi, ma scaleranno per i set di big data molto meglio di altre. Inoltre df <- data.table(x)è una copia completa dei dati. Inoltre, stai stampando i dati in alcuni esempi (che è un'altra copia completa) mentre non in altri.

— David Arenburg,

Hai ragione, risolto.

— GoGonzo,

alcune funzioni restituiscono oggetti diversi - alcuni vettori e alcuni frame di dati - quindi non è ancora un benchmark abbastanza equo. Anche alcuni danno risultati diversi. Prova r = runner_streak(x); d = dt_streak(dt) ; all.equal(r, d$sum). Solo controllato alcuni bbut tv_streakdà lo stesso di dt_streak; count_and_sumdà lo stesso runner_streakche sono diversi dai due precedenti.

— user2957945

3

In R, puoi anche fare:

# DATA
set.seed(100)
x <- round(rnorm(20, sd = 0.02), 3)

library(data.table)
dt <- data.table(x = x)

# Create Positive or Negative variable
dt$x_logical <- ifelse(dt$x > 0, "P", "N")

# Create a reference data.frame/table to keep continuous counts
seq_dt <- data.frame(val = rle(x = dt$x_logical)$lengths)
seq_dt$id <- 1:nrow(seq_dt)

# Map id in the main data.table and get cumulative sum
dt$id <- rep(seq_dt$id, seq_dt$val)
dt[, csum := cumsum(x), by = "id"]


        x x_logical id   csum
 1: -0.010         N  1 -0.010
 2:  0.003         P  2  0.003
 3: -0.002         N  3 -0.002
 4:  0.018         P  4  0.018
 5:  0.002         P  4  0.020
 6:  0.006         P  4  0.026
 7: -0.012         N  5 -0.012
 8:  0.014         P  6  0.014
 9: -0.017         N  7 -0.017
10: -0.007         N  7 -0.024
11:  0.002         P  8  0.002
12:  0.002         P  8  0.004
13: -0.004         N  9 -0.004
14:  0.015         P 10  0.015
15:  0.002         P 10  0.017
16: -0.001         N 11 -0.001
17: -0.008         N 11 -0.009
18:  0.010         P 12  0.010
19: -0.018         N 13 -0.018
20:  0.046         P 14  0.046

— MKA
fonte

3

Lanciando la mia [r] risposta nel cappello, ottimizzata per la velocità e funziona con qualsiasi lunghezza di x (a differenza di quella del richiedente che era hard coded per la lunghezza 20):

### data 
set.seed(100)
x <- round(rnorm(20, sd = 0.02), 3)

### solution
summation <- c(x[1])
enn <- 1
n_of_seq <- c(enn)
for(i in 2:length(x)){
  first <- x[i]
  second <- summation[i - 1]

  if(sign(first) == sign(second)){
    summation <- c(summation, first + second)
    enn <- enn + 1
  }else{
    summation <- c(summation, first)
    enn <- 1

  }
  n_of_seq <- c(n_of_seq, enn)
  }

E, per confrontare i tempi di esecuzione sul mio attuale computer di lavoro (molto lento), ecco l'output del mio microbenchmark utilizzando tutte le soluzioni R in questo thread. Non sorprende che le soluzioni che producono il maggior numero di copie e conversioni tendessero ad essere più lente.

Unit: microseconds
         expr      min       lq       mean    median       uq      max neval
     my_way()   13.301   19.200   23.38352   21.4010   23.401  20604.0 1e+05
 author_way()   19.702   31.701   40.12371   36.0015   40.502  24393.9 1e+05
      ronak()  856.401 1113.601 1305.36419 1236.8010 1377.501 453191.4 1e+05
      ameer()  388.501  452.002  553.08263  491.3000  548.701 456156.6 1e+05
     andrew() 2007.801 2336.801 2748.57713 2518.1510 2760.302 463175.8 1e+05
      gonzo()   21.901   35.502   48.84946   43.9010   51.001  29519.5 1e+05

-------------- MODIFICA -------------- È stato sottolineato da @nicola che la mia soluzione non è la più veloce per lunghezze maggiori di x - che dovrebbe essere abbastanza ovvio poiché sto facendo continuamente copie di vettori usando chiamate come x <- c (x, y). Ho creato solo la soluzione più veloce per lunghezze = 20 e solo il microbench ha segnato il più basso possibile.

Per fare un confronto più equo, ho modificato tutte le versioni per generare il codice originale nel modo in cui credo sia più veloce, ma accolgo con favore il feedback. Ecco il mio codice di benchmark completo e i risultati per il mio sistema molto lento. Accolgo con favore qualsiasi feedback.

# originally benchmarked a few different lengths
for(pie in c(100000)){


my_way<- function(){
  set.seed(100)
  x <- round(rnorm(pie, sd = 0.02), 3)
summation <- c(x[1])
enn <- 1
n_of_seq <- c(enn)
for(i in 2:length(x)){
  first <- x[i]
  second <- summation[i - 1]

  if(sign(first) == sign(second)){
    summation <- c(summation, first + second)
    enn <- enn + 1
  }else{
    summation <- c(summation, first)
    enn <- 1

  }
  n_of_seq <- c(n_of_seq, enn)
  }

# print(summation)
}




author_way <- function(){
  set.seed(100)
  x <- round(rnorm(pie, sd = 0.02), 3)

  sign_indicator <- ifelse(x > 0, 1,-1)
  sky <- length(x)
  number_of_sequence <- rep(NA, sky)
  n <- 1
  for (i in 2:sky) {
    if (sign_indicator[i] == sign_indicator[i - 1]) {
      n <- n + 1
    } else{
      n <- 1
    }
    number_of_sequence[i] <- n

  }
  number_of_sequence[1] <- 1

  #############################

  summation <- rep(NA, sky)

  for (i in 1:sky) {
    summation[i] <- sum(x[i:(i + 1 - number_of_sequence[i])])
  }
}


# other ppls solutions:




ronak <- function(){
df <- data.table('x' = round(rnorm(pie, sd = 0.02), 3))
df[, c("n_of_sequence", "sum") := list(seq_len(.N), cumsum(x)),rleid(sign(x))]
}



ameer <- function(){
  set.seed(100)
  x <- round(rnorm(pie, sd = 0.02), 3)
  run_lengths <- rle(sign(x))$lengths
  n_of_sequence <- run_lengths %>% map(seq) %>% unlist
  start <- cumsum(c(1,run_lengths))
  start <- start[-length(start)] # start points of each series 
  map2(start,run_lengths,~cumsum(x[.x:(.x+.y-1)])) %>% unlist()

}


count_and_sum <- function(x){
  set.seed(100)
  x <- round(rnorm(pie, sd = 0.02), 3)
  runs   <- rle((x > 0) * 1)$lengths
  groups <- split(x, rep(1:length(runs), runs))
  output <- function(group) data.frame(x = group, n = seq_along(group), sum = cumsum(group))
  result <- as.data.frame(do.call(rbind, lapply(groups, output)))
  `rownames<-`(result, 1:nrow(result))
}



andrew <- function(){
  set.seed(100)
  df <- tibble(x = round(rnorm(pie, sd = 0.02), 3)) %>% 
    mutate(seqno = cumsum(c(1, diff(sign(x)) != 0))) %>% #identify sequence ids
    group_by(seqno) %>%                                  #group by sequences
    mutate(n_of_sequence = row_number(),                 #count row numbers for each group
           sum = cumsum(x)) %>%                          #cumulative sum for each group
    ungroup() %>% 
    select(-seqno) 
}

gonzo <- function(){
  set.seed(100)
  x <- round(rnorm(pie, sd = 0.02), 3)
  n_of_sequence <- runner::streak_run(x > 0)
  sum <- runner::sum_run(x, k = n_of_sequence)
}



mi1 <- microbenchmark(my_way(), author_way(), ronak(), ameer(), andrew(), gonzo(), times = 10)
print(mi1)

}

Come mostrano questi risultati, per lunghezze diverse da quelle per cui ho ottimizzato, la mia versione è lenta. Più lunga è x, più lentamente diventa ridicolmente lenta in tutto ciò che è superiore a 1000. La mia versione preferita è quella di Ronak che è solo la seconda più veloce sul mio sistema. GoGonzo è di gran lunga il più veloce sulla mia macchina a lunghezze maggiori.

Unit: milliseconds
         expr        min         lq        mean      median         uq        max neval
     my_way() 21276.9027 21428.2694 21604.30191 21581.97970 21806.9543 21896.7105    10
 author_way()    82.2465    83.0873    89.42343    84.78315    85.3638   115.4550    10
      ronak()    68.3922    69.3067    70.41924    69.84625    71.3509    74.7070    10
      ameer()   481.4566   509.7552   521.19034   514.77000   530.1121   579.4707    10
     andrew()   200.9654   202.1898   210.84914   206.20465   211.2006   233.7618    10
      gonzo()    27.3317    28.2550    28.66679    28.50535    28.9104    29.9549    10

— Adverse_Event
fonte

Anche le altre risposte funzionano per qualsiasi lunghezza e il tuo benchmark deve avere qualche problema. Rispetto alla data.tablesoluzione di @ Ronak, il tuo ordine di grandezza è più lento per una lunghezza di ~ 100000.

— nicola,

Grazie @nicola, ho solo detto che la soluzione del richiedente ha funzionato per soli 20 articoli, non che qualsiasi altra soluzione non ha funzionato - in effetti lo fanno. Ho anche ottimizzato la velocità per la lunghezza di 20 articoli, quindi la mia richiesta di essere il più veloce finisce lì. Per quello che vale, mi è piaciuta anche la soluzione Ronaks, ma l'autore ha esplicitamente chiesto modi più diversi di risolvere il problema. Ronak è già più veloce anche per una lunghezza di 1000.

— Adverse_Event

E per espandere il microbenchmark. Ho ricodificato il mio benchmark in modo che ogni soluzione stesse creando (x) nel formato che stanno usando, quindi quelli che fanno le variabili generano x nella chiamata tibble, lo stesso per data.table ecc. Ho ricodificato la soluzione originale del richiedente in modo che funzioni per qualsiasi lunghezza (semplicemente salvando la lunghezza di x in una variabile e sostituendo il 20 con essa. L'ho quindi eseguito per una lunghezza di 100.000 per 10 iterazioni. Si noti che il mio computer è suuuuper lento, gira su un processore inter di quinta generazione con ddr3 a 1600 mHz. Sto modificando il mio post con questi risultati.

— Adverse_Event

2

In Python, oltre a definire una classe per memorizzare le variabili di memoria, puoi usare una chiusura per ottenere lo stesso.

def run():
    count = 0
    last_sign = 0

    def sign(i):
        return 1 if i > 0 else -1

    def f(i):
        nonlocal count
        nonlocal last_sign
        if sign(i) == last_sign:
            count = count+1
        else:
            last_sign = sign(i)
            count = 1
        return count

    return f

f = run()
y = [f(i) for i in x]

Nota che funziona solo per Python 3 (in Python 2 penso che non puoi modificare la variabile di chiusura in questo modo). Una cosa simile anche per la sommatoria.

— Prodipta Ghosh
fonte

2

Penso che un ciclo sarebbe più facile da leggere, ma solo per divertimento, ecco una soluzione in Python usando la ricorsione:

x = [-0.01, 0.003, -0.002, 0.018, 0.002, 0.006, -0.012, 0.014, -0.017, -0.007, 0.002, 0.002, -0.004, 0.015, 0.002,
     -0.001, -0.008, 0.01, -0.018, 0.046]


def sign(number):
    return 1 if number > 0 else -1


def sum_previous(pos, result=None):
    if not result:
        result = x[pos]
    else:
        result += x[pos]
    if pos == 0 or sign(x[pos]) != sign(x[pos-1]):
        return result
    else:
        return sum_previous(pos-1, result)


results = [sum_previous(i) for i in range(len(x))]
print(results)

— RogB
fonte

2

Ecco un altro approccio di base R:

data.frame(x,
           n = sequence(rle(sign(x))$lengths),
           sum = Reduce(function(x, y) if (sign(x) == sign(y)) x + y else y, x, accumulate = TRUE))

        x n    sum
1  -0.010 1 -0.010
2   0.003 1  0.003
3  -0.002 1 -0.002
4   0.018 1  0.018
5   0.002 2  0.020
6   0.006 3  0.026
7  -0.012 1 -0.012
8   0.014 1  0.014
9  -0.017 1 -0.017
10 -0.007 2 -0.024
11  0.002 1  0.002
12  0.002 2  0.004
13 -0.004 1 -0.004
14  0.015 1  0.015
15  0.002 2  0.017
16 -0.001 1 -0.001
17 -0.008 2 -0.009
18  0.010 1  0.010
19 -0.018 1 -0.018
20  0.046 1  0.046

— H 1
fonte

Solo per nitpick, Reducenasconde un loop, quindi questa non è una soluzione non-loop.

— nicola,

2

Una semplice risposta Python, ignora il caso 0:

x = [-0.01, 0.003, -0.002, 0.018, 
     0.002, 0.006, -0.012, 0.014, 
     -0.017, -0.007, 0.002, 0.002, 
     -0.004, 0.015, 0.002, -0.001, 
     -0.008, 0.01, -0.018, 0.046]

count = 0
sign_positive = x[0] > 0
sign_count = []
for n in x:
    # the idea is to keep track of the sign and increment the 
    # count if it agrees with the current number we are looking at
    if (n > 0 and sign_positive) or (n < 0 and not sign_positive):
        count = count + 1
    # if it does not, the count goes back to 1
    else:
        count = 1
    # Whether we increased the count or not, we update whether the
    # sign was positive or negative
    sign_positive = n > 0
    sign_count.append(count)

# This is just to reproduce the output 
# (although I find the last repetition of the number unnecessary)    
results = list(zip(x, sign_count))
for i, result in enumerate(results):
    print(f"{i: >2d} {result[0]: .3f} {result[1]: >2d} {result[0]: .3f}")

 0 -0.010  1 -0.010
 1  0.003  1  0.003
 2 -0.002  1 -0.002
 3  0.018  1  0.018
 4  0.002  2  0.002
 5  0.006  3  0.006
 6 -0.012  1 -0.012
 7  0.014  1  0.014
 8 -0.017  1 -0.017
 9 -0.007  2 -0.007
10  0.002  1  0.002
11  0.002  2  0.002
12 -0.004  1 -0.004
13  0.015  1  0.015
14  0.002  2  0.002
15 -0.001  1 -0.001
16 -0.008  2 -0.008
17  0.010  1  0.010
18 -0.018  1 -0.018
19  0.046  1  0.046

Una soluzione un po 'più sofisticata, si occupa anche del caso 0:

# To test the 0 case I am changing two numbers to 0
x = [-0.01, 0.003, -0.002, 0.018, 
     0.002, 0.006, -0.012, 0.014, 
    -0.017, -0.007, 0, 0, 
    -0.004, 0.015, 0.002, -0.001, 
    -0.008, 0.01, -0.018, 0.046]

# The rest is similar
count = 0
# This time we are using a nested ternary assignment 
# to account for the case of 0
# This would be more readable as a function, 
# but what it does is simple
# It returns None if n is 0, 
# True if it is larger than 0 
# and False if it less than 0
sign_positive = None if n == 0 else False if n < 0 else True
sign_count = []
for n in x:
    # We add the case of 0 by adding a third condition where
    # sign_positive was None (meaning the previous
    # number was 0) and the current number is 0.
    if (n > 0 and sign_positive) or \
       (n < 0 and not sign_positive) or \
       (n == 0 and sign_positive == None):
        count = count + 1
    else:
        count = 1
    sign_positive = None if n == 0 else False if n < 0 else True
    sign_count.append(count)
results = list(zip(x, sign_count))
for i, result in enumerate(results):
    print(f"{i: >2d} {result[0]: .3f} {result[1]: >2d} {result[0]: .3f}")

 0 -0.010  1 -0.010
 1  0.003  1  0.003
 2 -0.002  1 -0.002
 3  0.018  1  0.018
 4  0.002  2  0.002
 5  0.006  3  0.006
 6 -0.012  1 -0.012
 7  0.014  1  0.014
 8 -0.017  1 -0.017
 9 -0.007  2 -0.007
10  0.000  1  0.000
11  0.000  2  0.000
12 -0.004  3 -0.004
13  0.015  1  0.015
14  0.002  2  0.002
15 -0.001  1 -0.001
16 -0.008  2 -0.008
17  0.010  1  0.010
18 -0.018  1 -0.018
19  0.046  1  0.046

— Sinan Kurmus
fonte