textmodel_affinity implements the maximum likelihood supervised text scaling method described in Perry and Benoit (2017).

textmodel_affinity(x, y, exclude = NULL, smooth = 0.5, ref_smooth = 0.5,
  verbose = TRUE)

Arguments

x

the dfm or bootstrap_dfm object on which the model will be fit. Does not need to contain only the training documents, since the index of these will be matched automatically.

y

vector of training classes/scores associated with each document in x; documents without a training class should be NA

exclude

a set of words to exclude from the model

smooth

a smoothing parameter for class affinities; defaults to 0.5 (Jeffreys prior). A plausible alternative would be 1.0 (Laplace prior); see the sketch after the argument descriptions.

ref_smooth

a smoothing parameter for token distributions; defaults to 0.5

verbose

logical; if TRUE print diagnostic information during fitting.
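
The sketch below shows how the smoothing and exclusion arguments fit together in a single call. The Laplace prior (smooth = 1.0) and the excluded features are illustrative choices for the example data, not defaults or recommendations; the package is assumed to be attached, as in the Examples.

# illustrative only: refit the example model with a Laplace prior and an
# exclusion set; smooth = 1.0 and the excluded features ("a", "zk") are
# assumptions chosen for demonstration
af_laplace <- textmodel_affinity(data_dfm_lbgexample,
                                 y = c("L", NA, NA, NA, "R", NA),
                                 exclude = c("a", "zk"),
                                 smooth = 1.0)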

References

Perry, Patrick O. and Kenneth Benoit. (2017) "Scaling Text with the Class Affinity Model". arXiv:1710.08963 [stat.ML].

See also

predict.textmodel_affinity for methods of applying a fitted textmodel_affinity model object to predict quantities from (other) documents.

Examples

(af <- textmodel_affinity(data_dfm_lbgexample, y = c("L", NA, NA, NA, "R", NA)))
#> Call:
#> textmodel_affinity.dfm(x = data_dfm_lbgexample, y = c("L", NA, 
#>     NA, NA, "R", NA))
#> 
#> Training documents per class: L: 1, R: 1; total training features: 37
predict(af)
#> $coefficients
#>               L            R
#> R1 0.9994964426 0.0005035574
#> R2 0.9994114959 0.0005885041
#> R3 0.5000000000 0.5000000000
#> R4 0.0005885041 0.9994114959
#> R5 0.0005035574 0.9994964426
#> V1 0.9986670961 0.0013329039
#> 
#> $se
#>               L            R
#> R1 0.0007119597 0.0007119597
#> R2 0.0008319957 0.0008319957
#> R3 0.0273371117 0.0273371117
#> R4 0.0008319957 0.0008319957
#> R5 0.0007119597 0.0007119597
#> V1 0.0018655335 0.0018655335
#> 
#> $cov
#> , , R1
#> 
#>               L             R
#> L  5.068867e-07 -5.068867e-07
#> R -5.068867e-07  5.068867e-07
#> 
#> , , R2
#> 
#>               L             R
#> L  6.922169e-07 -6.922169e-07
#> R -6.922169e-07  6.922169e-07
#> 
#> , , R3
#> 
#>               L             R
#> L  0.0007473177 -0.0007473177
#> R -0.0007473177  0.0007473177
#> 
#> , , R4
#> 
#>               L             R
#> L  6.922169e-07 -6.922169e-07
#> R -6.922169e-07  6.922169e-07
#> 
#> , , R5
#> 
#>               L             R
#> L  5.068867e-07 -5.068867e-07
#> R -5.068867e-07  5.068867e-07
#> 
#> , , V1
#> 
#>               L             R
#> L  3.480215e-06 -3.480215e-06
#> R -3.480215e-06  3.480215e-06
#> 
#> 
#> $smooth
#> [1] 0.5 0.5
#> 
#> $newdata
#> Document-feature matrix of: 6 documents, 37 features (54.1% sparse).
#> 
#> $train
#> [1]  TRUE FALSE FALSE FALSE  TRUE FALSE
#> 
#> $level
#> [1] 0.95
#> 
#> $p
#> 37 x 2 sparse Matrix of class "dgCMatrix"
#>         docs
#> features            L            R
#>       a  0.0024582104 0.0004916421
#>       b  0.0034414946 0.0004916421
#>       c  0.0103244838 0.0004916421
#>       d  0.0221238938 0.0004916421
#>       e  0.0447394297 0.0004916421
#>       f  0.0771878073 0.0004916421
#>       g  0.1135693215 0.0004916421
#>       h  0.1440511308 0.0004916421
#>       i  0.1558505408 0.0004916421
#>       j  0.1440511308 0.0004916421
#>       k  0.1135693215 0.0004916421
#>       l  0.0771878073 0.0004916421
#>       m  0.0447394297 0.0004916421
#>       n  0.0221238938 0.0004916421
#>       o  0.0103244838 0.0004916421
#>       p  0.0034414946 0.0004916421
#>       q  0.0024582104 0.0004916421
#>       r  .            .           
#>       s  .            .           
#>       t  .            .           
#>       u  0.0004916421 0.0024582104
#>       v  0.0004916421 0.0034414946
#>       w  0.0004916421 0.0103244838
#>       x  0.0004916421 0.0221238938
#>       y  0.0004916421 0.0447394297
#>       z  0.0004916421 0.0771878073
#>       za 0.0004916421 0.1135693215
#>       zb 0.0004916421 0.1440511308
#>       zc 0.0004916421 0.1558505408
#>       zd 0.0004916421 0.1440511308
#>       ze 0.0004916421 0.1135693215
#>       zf 0.0004916421 0.0771878073
#>       zg 0.0004916421 0.0447394297
#>       zh 0.0004916421 0.0221238938
#>       zi 0.0004916421 0.0103244838
#>       zj 0.0004916421 0.0034414946
#>       zk 0.0004916421 0.0024582104
#> 
#> $support
#>     a     b     c     d     e     f     g     h     i     j     k     l     m 
#>  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#>     n     o     p     q     r     s     t     u     v     w     x     y     z 
#>  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#>    za    zb    zc    zd    ze    zf    zg    zh    zi    zj    zk 
#>  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#> 
predict(af, newdata = data_dfm_lbgexample[6, ])
#> $coefficients
#>            L           R
#> V1 0.9986671 0.001332904
#> 
#> $se
#>              L           R
#> V1 0.001865533 0.001865533
#> 
#> $cov
#> , , V1
#> 
#>               L             R
#> L  3.480215e-06 -3.480215e-06
#> R -3.480215e-06  3.480215e-06
#> 
#> 
#> $smooth
#> [1] 0.5 0.5
#> 
#> $newdata
#> Document-feature matrix of: 1 document, 37 features (54.1% sparse).
#> 
#> $train
#> [1] FALSE
#> 
#> $level
#> [1] 0.95
#> 
#> $p
#> 37 x 2 sparse Matrix of class "dgCMatrix"
#>         docs
#> features            L            R
#>       a  0.0024582104 0.0004916421
#>       b  0.0034414946 0.0004916421
#>       c  0.0103244838 0.0004916421
#>       d  0.0221238938 0.0004916421
#>       e  0.0447394297 0.0004916421
#>       f  0.0771878073 0.0004916421
#>       g  0.1135693215 0.0004916421
#>       h  0.1440511308 0.0004916421
#>       i  0.1558505408 0.0004916421
#>       j  0.1440511308 0.0004916421
#>       k  0.1135693215 0.0004916421
#>       l  0.0771878073 0.0004916421
#>       m  0.0447394297 0.0004916421
#>       n  0.0221238938 0.0004916421
#>       o  0.0103244838 0.0004916421
#>       p  0.0034414946 0.0004916421
#>       q  0.0024582104 0.0004916421
#>       r  .            .           
#>       s  .            .           
#>       t  .            .           
#>       u  0.0004916421 0.0024582104
#>       v  0.0004916421 0.0034414946
#>       w  0.0004916421 0.0103244838
#>       x  0.0004916421 0.0221238938
#>       y  0.0004916421 0.0447394297
#>       z  0.0004916421 0.0771878073
#>       za 0.0004916421 0.1135693215
#>       zb 0.0004916421 0.1440511308
#>       zc 0.0004916421 0.1558505408
#>       zd 0.0004916421 0.1440511308
#>       ze 0.0004916421 0.1135693215
#>       zf 0.0004916421 0.0771878073
#>       zg 0.0004916421 0.0447394297
#>       zh 0.0004916421 0.0221238938
#>       zi 0.0004916421 0.0103244838
#>       zj 0.0004916421 0.0034414946
#>       zk 0.0004916421 0.0024582104
#> 
#> $support
#>     a     b     c     d     e     f     g     h     i     j     k     l     m 
#>  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#>     n     o     p     q     r     s     t     u     v     w     x     y     z 
#>  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#>    za    zb    zc    zd    ze    zf    zg    zh    zi    zj    zk 
#>  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE 
#> 
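
# The prediction is returned as a list, so the components printed above can
# be accessed directly. As a minimal sketch, the fitted class affinities are
# in the coefficients element:
pred <- predict(af)
pred$coefficients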
# NOT RUN {
# compute bootstrapped SEs
bs_dfm <- bootstrap_dfm(data_corpus_dailnoconf1991, n = 10, remove_punct = TRUE)
textmodel_affinity(bs_dfm, y = c("Govt", "Opp", "Opp", rep(NA, 55)))
# }