将一个数据集与另一个数据集中的最近邻进行匹配(关键词:邻域、数据、最近)

2023-09-03 13:57:21 作者:偷走你的满目星河

我正在尝试使用 MatchIt 包,根据一个虚拟变量(dummy)来进行匹配。

如果虚拟变量(dummy)的值为 TRUE,则使用两个匹配变量来匹配对照公司;如果为 FALSE,则仅使用其中一个变量进行匹配。

论AI应用的落地方向,读懂老板心情,来年升职加薪

我有以下代码,但不知道如何将上述情况包含到函数中。我无法运行两个匹配过程来分隔Dummy=True和Dummy=False时的数据,因为这可能会导致两个匹配过程中出现相同的匹配。

# Asker's current attempt: a single Mahalanobis-distance matching run that
# uses both covariates for every unit, with exact matching on year, sic and
# country.
matchman <- matchit(
  treated ~ laggedsize + lagged.env_score,
  data = df,
  distance = "mahalanobis",
  exact = ~ year + sic + country
)

其中 lagged.env_score 仅应在 dummy = TRUE 时用于匹配;如果 dummy = FALSE,则仅使用 laggedsize 进行匹配。

示例数据 df(dput() 输出;注意其中的列名为 laggedenv.score,且不含 year 列):

df <- structure(list(isin = c("JP3304200003", "JP3890350006", "GB00B10RZP78", 
"GB0031348658", "GB00BH4HKS39", "JP3502200003", "GB0005405286", 
"JP3276400003", "DE000CBK1001", "DE0005140008", "JP3899600005", 
"GB00B7T77214", "JP3200450009", "DE0008430026", "JP3199000005", 
"JP3582600007", "DE000BASF111", "DE0007100000", "JP3505000004", 
"JP3210200006", "JP3258000003", "JP3569200003", "JP3388600003", 
"DE0005194062", "JP3900000005", "JP3753000003", "JP3386450005", 
"JP3429800000", "JP3111200006", "JP3215800008", "JP3143600009", 
"JP3526600006", "JP3246400000", "JP3893200000", "JP3877600001", 
"JP3870400003", "JP3902000003", "JP3362700001", "DE000ENAG999", 
"JP3309000002", "JP3605400005", "JP3932000007", "JP3573000001", 
"JP3190000004", "DE0007037129", "JP3358800005", "JP3407000003", 
"GB00BNR4T868", "JP3789000001", "JP3919800007"), sic = c(35, 
60, 28, 60, 48, 62, 60, 60, 60, 60, 65, 60, 61, 63, 61, 65, 28, 
37, 15, 16, 20, 15, 61, 51, 35, 47, 29, 45, 28, 28, 50, 49, 49, 
65, 50, 53, 42, 44, 99, 16, 49, 35, 49, 16, 49, 15, 42, 49, 35, 
36), treated = c(1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 
1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 
1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1), country = c("JP", 
"JP", "GB", "GB", "GB", "JP", "GB", "JP", "DE", "DE", "JP", "GB", 
"JP", "DE", "JP", "JP", "DE", "DE", "JP", "JP", "JP", "JP", "JP", 
"DE", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", "JP", 
"JP", "JP", "JP", "JP", "DE", "JP", "JP", "JP", "JP", "JP", "DE", 
"JP", "JP", "GB", "JP", "JP"), greenbond = c(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0), laggedsize = c(9.64948242832745, 13.7249455275361, 10.6253922435153, 
13.7249455275361, 12.0806213380608, 11.8193300010735, 13.7249455275361, 
10.7963674506149, 13.5335441201084, 13.7249455275361, 10.4483597497766, 
13.7249455275361, 11.0233960616849, 12.3731028837552, 10.4353256541104, 
9.05291497486036, 10.9919316532886, 11.8191593828165, 9.62774359721499, 
9.56306369398111, 10.1019759184986, 9.03090594532814, 10.0165213601746, 
8.08741008304174, 10.4269602761474, 9.7687281946708, 10.3951116474749, 
9.59710464541169, 9.29102186770426, 8.13972317845105, 10.6775497846209, 
10.6447225064563, 10.3767716217502, 10.2881661005029, 10.5001531182996, 
8.56808459082482, 7.90326540259056, 9.59830182961864, 11.9374151199707, 
7.75367768887085, 10.3427480263202, 7.55675119529438, 9.58731390722022, 
9.44117363933369, 11.4411823865573, 9.42588383476401, 7.53303741646037, 
7.12522098177824, 7.9252819694109, 7.40008821333257), laggedenv.score = c(NA, 
53.3421510847241, 90.737128915381, 82.9224487392979, 62.7432787102952, 
68.7604808706758, 87.9530360847733, 46.6230810769112, 93.5434730659094, 
88.2365033736853, 88.5789348754909, 80.5955107045558, 60.3676789172653, 
91.7607218845976, NA, 8.90804597701149, 87.0323650463109, 90.979229822056, 
49.3650793650793, 79.2698412698412, 71.6268004115226, NA, NA, 
NA, 88.6209695476936, 54.1014492753623, NA, 70.7252727009565, 
67.5843026720795, 57.9365389874003, 60.5339105339105, 90.46938726037, 
NA, 92.4138290679441, 47.8715728715728, NA, 36.1795462601914, 
70.8840579710145, 58.0620532813515, NA, 41.9395107079791, 66.8071163889808, 
87.885006010861, 66.2893188283065, 94.3875893437297, 48.4761904761904, 
NA, 47.364849321371, NA, NA), dummy = c(TRUE, FALSE, FALSE, 
TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, 
FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, 
TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, 
FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, 
TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE)), row.names = c(NA, 
-50L), groups = structure(list(isin = c("DE0005140008", "DE0005194062", 
"DE0007037129", "DE0007100000", "DE0008430026", "DE000BASF111", 
"DE000CBK1001", "DE000ENAG999", "GB0005405286", "GB0031348658", 
"GB00B10RZP78", "GB00B7T77214", "GB00BH4HKS39", "GB00BNR4T868", 
"JP3111200006", "JP3143600009", "JP3190000004", "JP3199000005", 
"JP3200450009", "JP3210200006", "JP3215800008", "JP3246400000", 
"JP3258000003", "JP3276400003", "JP3304200003", "JP3309000002", 
"JP3358800005", "JP3362700001", "JP3386450005", "JP3388600003", 
"JP3407000003", "JP3429800000", "JP3502200003", "JP3505000004", 
"JP3526600006", "JP3569200003", "JP3573000001", "JP3582600007", 
"JP3605400005", "JP3753000003", "JP3789000001", "JP3870400003", 
"JP3877600001", "JP3890350006", "JP3893200000", "JP3899600005", 
"JP3900000005", "JP3902000003", "JP3919800007", "JP3932000007"
), .rows = structure(list(10L, 24L, 45L, 18L, 14L, 17L, 9L, 39L, 
    7L, 4L, 3L, 12L, 5L, 48L, 29L, 31L, 44L, 15L, 13L, 20L, 30L, 
    33L, 21L, 8L, 1L, 40L, 46L, 38L, 27L, 23L, 47L, 28L, 6L, 
    19L, 32L, 22L, 43L, 16L, 41L, 26L, 49L, 36L, 35L, 2L, 34L, 
    11L, 25L, 37L, 50L, 42L), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, 50L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

推荐答案

实际上,您可以通过两次单独调用 matchit() 来实现。只需确保在第二次调用中不会重复使用第一次调用中已经匹配上的单位。

# Two-stage matching with MatchIt: units with dummy == TRUE are matched on both
# covariates first; every unit not matched in that stage is then matched on
# laggedsize only, so no unit can appear in both sets of matches.
#
# NOTE(review): the sample `df` shown in the question names the score column
# `laggedenv.score` and has no `year` column -- adjust the formulas to the
# columns of your actual data. matchit() presumably also rejects missing
# values in the matching covariates, so rows with NA there need handling
# beforehand -- confirm against the MatchIt documentation.

# Row positions (within df) of the units eligible for the first stage.
# which() also drops any NA in dummy.
stage1_rows <- which(df$dummy)

matchman1 <- matchit(treated ~ laggedsize + lagged.env_score,
                     data = df[stage1_rows, ],
                     distance = "mahalanobis",
                     exact = ~ year + sic + country)

# matchman1$weights has one entry per row of the stage-1 subset, not per row
# of df, so it cannot index df directly. Map the matched units back to row
# positions in df and exclude exactly those from the second stage.
matched_rows <- stage1_rows[matchman1$weights > 0]
stage2_rows <- setdiff(seq_len(nrow(df)), matched_rows)

matchman2 <- matchit(treated ~ laggedsize,
                     data = df[stage2_rows, ],
                     distance = "mahalanobis",
                     exact = ~ year + sic + country)

第一个调用只匹配 df$dummy == TRUE 的单位;在其输出中,任何未被匹配的单位的权重都为 0。第二个调用对其余单位进行匹配,但仅基于 laggedsize。您可以分别评估每组匹配的平衡性,然后使用 rbind() 将它们合并:
# Extract the matched sample from each matching stage.
m.data1 <- match.data(matchman1)
m.data2 <- match.data(matchman2)

# Pair IDs in `subclass` restart at 1 in every matchit() fit. Prefix them with
# the stage number so pairs from the two fits stay distinct after stacking
# (this matters, for example, when clustering standard errors on subclass).
if ("subclass" %in% names(m.data1)) {
  m.data1$subclass <- factor(paste0("1_", m.data1$subclass))
}
if ("subclass" %in% names(m.data2)) {
  m.data2$subclass <- factor(paste0("2_", m.data2$subclass))
}

# Stack both matched samples into one data set for the outcome analysis.
m.data <- rbind(m.data1, m.data2)