R cut Function


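The cut() function divides the range of a numeric vector into intervals and codes each value according to the interval it falls into. The result is a factor whose levels are the intervals.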

cut(x, breaks, labels = NULL,
include.lowest = FALSE, right = TRUE, dig.lab = 3,
ordered_result = FALSE, ...)


• x: numeric vector
• breaks: either a numeric vector of two or more break points, or a single number giving the number of intervals into which x is to be cut.
• labels: level labels, character vector.
• include.lowest: logical, whether a value equal to the lowest (or highest, for right = FALSE) break point should be included
• right: logical, the intervals should be closed on the right (and open on the left) or vice versa

> x <- stats::rnorm(100)
> x

[1] -0.154103462 0.271704132 -0.234160855 0.764474679 0.438237645
[6] -0.763854668 1.303402711 0.051660328 1.064258570 0.079144697
[11] -0.704381407 2.239763673 -0.749203152 0.601148921 -0.174814689
[16] 0.100238929 0.670921777 -0.351881772 -1.452691553 0.774250401
[21] 0.985238459 -0.159947063 0.456925349 0.062732203 -0.139094156
[26] -0.021987877 -0.369758710 -0.623015605 0.818971164 1.024360342
[31] -1.180039385 -1.126115746 -1.331609773 0.261068252 0.306040509
[36] 0.186887898 0.039764640 0.618133561 0.808466877 1.530479825
[41] -0.326594787 -0.525549355 -0.038649831 -0.320394434 -0.116615568
[46] -0.928403864 1.284014444 0.559523194 0.511753047 -0.093609863
[51] -1.199423552 -0.358438485 -1.421215594 -0.199430722 -1.285244671
[56] -0.344308069 0.202383513 -1.044830704 0.009940864 -1.083693166
[61] 0.985718206 0.942167477 0.077569581 1.456191918 -1.385394960
[66] -0.174887806 -0.869293103 1.051227075 -0.726361522 0.082628666
[71] 1.275779587 0.258221666 -0.629207453 -0.589352154 -0.818233970
[76] 0.028423636 -0.491220068 0.796916741 -1.407925480 0.765093431
[81] -0.263630781 0.854937357 0.592710059 -0.095388956 -1.064601796
[86] 0.691149856 0.822038961 0.666786287 -1.062610036 -2.833961199
[91] 1.570993774 -0.876630726 -0.343492831 -0.480549452 1.494723381
[96] -2.025528709 0.949853574 -0.917568904 -1.103676434 0.728284402


Divide the data into ranges -5 ~ 5:

> c <- cut(x,breaks=-5:5)
> c

[1] (-1,0] (0,1] (-1,0] (0,1] (0,1] (-1,0] (1,2] (0,1] (1,2]
[10] (0,1] (-1,0] (2,3] (-1,0] (0,1] (-1,0] (0,1] (0,1] (-1,0]
[19] (-2,-1] (0,1] (0,1] (-1,0] (0,1] (0,1] (-1,0] (-1,0] (-1,0]
[28] (-1,0] (0,1] (1,2] (-2,-1] (-2,-1] (-2,-1] (0,1] (0,1] (0,1]
[37] (0,1] (0,1] (0,1] (1,2] (-1,0] (-1,0] (-1,0] (-1,0] (-1,0]
[46] (-1,0] (1,2] (0,1] (0,1] (-1,0] (-2,-1] (-1,0] (-2,-1] (-1,0]
[55] (-2,-1] (-1,0] (0,1] (-2,-1] (0,1] (-2,-1] (0,1] (0,1] (0,1]
[64] (1,2] (-2,-1] (-1,0] (-1,0] (1,2] (-1,0] (0,1] (1,2] (0,1]
[73] (-1,0] (-1,0] (-1,0] (0,1] (-1,0] (0,1] (-2,-1] (0,1] (-1,0]
[82] (0,1] (0,1] (-1,0] (-2,-1] (0,1] (0,1] (0,1] (-2,-1] (-3,-2]
[91] (1,2] (-1,0] (-1,0] (-1,0] (1,2] (-3,-2] (0,1] (-1,0] (-2,-1]
[100] (0,1]
10 Levels: (-5,-4] (-4,-3] (-3,-2] (-2,-1] (-1,0] (0,1] (1,2] (2,3] ... (4,5]


Check the data distribution in different ranges:

> summary(c) #or table(c)

c
(-5,-4] (-4,-3] (-3,-2] (-2,-1] (-1,0] (0,1] (1,2] (2,3] (3,4] (4,5]
0 0 2 14 35 38 10 1 0 0


The numbers are divided into 10 levels of width 1, because breaks=-5:5 places a break point at every integer from -5 to 5. Some levels are empty. Instead of listing the break points, let's specify only the number of levels:

> x <- stats::rnorm(100) #random numbers, different every time
> c <- cut(x,breaks=10,dig.lab=2)
> summary(c)

(-2,-1.6] (-1.6,-1.1] (-1.1,-0.69] (-0.69,-0.24] (-0.24,0.21]
5 5 13 20 18
(0.21,0.65] (0.65,1.1] (1.1,1.5] (1.5,2] (2,2.4]
12 14 6 3 4


Label all the levels:

> x <- stats::rnorm(100)
> c <- cut(x,breaks=10,dig.lab=2,labels=1:10)
> summary(c)

1 2 3 4 5 6 7 8 9 10
5 5 13 20 18 12 14 6 3 4


Try again, divide into different ranges (break points):

> x <- stats::rnorm(100)
> c <- cut(x,breaks=c(-2,0,1,2))
> table(c)

c
(-2,0] (0,1] (1,2]
52 32 11

The right argument indicates whether the intervals should be closed on the right and open on the left (right=TRUE, the default) or vice versa (right=FALSE). With the default include.lowest=FALSE, a value equal to the lowest break point (or to the highest break point, for right=FALSE) is not included in any interval and becomes NA.

Let's first generate some data by simulating coin flips. Each run has 100 flips (size) and each flip has a 50% chance of heads (the probability of success), so the expected number of heads per run is size * probability = 50. rbinom() returns the number of heads actually obtained in each run, for 40 runs altogether.

> x <- rbinom(40,100,0.5)
> x
 [1] 53 53 51 52 58 54 54 53 43 60 56 52 57 55 52 57 52 44 54 44 51 51 45 49 48 57 48
[28] 45 52 51 53 55 46 48 47 45 48 50 46 47
> summary(x)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
43.00   47.75   51.50   50.90   54.00   60.00
> cut(x, breaks=c(43, 48, 52,58, 60))
# the minimum value 43 becomes <NA> because include.lowest=FALSE by default, so the first interval (43,48] excludes 43
 [1] (52,58] (52,58] (48,52] (48,52] (52,58] (52,58] (52,58] (52,58] <NA>    (58,60]
[11] (52,58] (48,52] (52,58] (52,58] (48,52] (52,58] (48,52] (43,48] (52,58] (43,48]
[21] (48,52] (48,52] (43,48] (48,52] (43,48] (52,58] (43,48] (43,48] (48,52] (48,52]
[31] (52,58] (52,58] (43,48] (43,48] (43,48] (43,48] (43,48] (48,52] (43,48] (43,48]
Levels: (43,48] (48,52] (52,58] (58,60]
> cut(x, breaks=c(43, 48, 52, 58, 60), include.lowest=T)
 [1] (52,58] (52,58] (48,52] (48,52] (52,58] (52,58] (52,58] (52,58] [43,48] (58,60]
[11] (52,58] (48,52] (52,58] (52,58] (48,52] (52,58] (48,52] [43,48] (52,58] [43,48]
[21] (48,52] (48,52] [43,48] (48,52] [43,48] (52,58] [43,48] [43,48] (48,52] (48,52]
[31] (52,58] (52,58] [43,48] [43,48] [43,48] [43,48] [43,48] (48,52] [43,48] [43,48]
Levels: [43,48] (48,52] (52,58] (58,60]
> cut(x, breaks=c(43, 48, 52,58, 60),right=FALSE)
# the maximum value 60 becomes <NA> because with right=FALSE the last interval [58,60) excludes 60
 [1] [52,58) [52,58) [48,52) [52,58) [58,60) [52,58) [52,58) [52,58) [43,48) <NA>
[11] [52,58) [52,58) [52,58) [52,58) [52,58) [52,58) [52,58) [43,48) [52,58) [43,48)
[21] [48,52) [48,52) [43,48) [48,52) [48,52) [52,58) [48,52) [43,48) [52,58) [48,52)
[31] [52,58) [52,58) [43,48) [48,52) [43,48) [43,48) [48,52) [48,52) [43,48) [43,48)
Levels: [43,48) [48,52) [52,58) [58,60)
> summary(cut(x, breaks=c(43, 48, 52, 58, 60), include.lowest=T))
[43,48] (48,52] (52,58] (58,60]
14      11      14       1