## Rev.        | 9b20561b836912c95b00de5d9d9ae3211d8fc7ee
## Size        | 3,449 bytes
## Time        | 2020-09-25 05:16:10
## Author      | Lorenzo Isella
## Log Message | Again, I look for keywords in the nomenclature.
## Setup: attach the packages used by this script and load the project's
## helper library.
##
## NOTE: the former `rm(list = ls())` call was removed on purpose. It deleted
## every object in the workspace of whoever sourced this script, yet it never
## produced a truly clean session (attached packages and options survive it).
## Run the script in a fresh R session instead (e.g. via `Rscript`).
library(tidyverse)
library(openxlsx)
library(janitor)
library(stringr)
library(stringi)
library(stringdist)
## library(furrr)
library(tictoc)
library(magrittr)
## Presumably provides the text-cleaning, matching and save_excel() helpers
## called further down -- TODO confirm. The absolute path is machine-specific.
source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
## Pick the candidate in `y` that is closest to `x` by Levenshtein distance.
##
## x: the string to look up (normally a single string).
## y: character vector of candidate strings.
## Value: the first element of `y` with the smallest distance to `x`; a
##        zero-length vector when which.min() finds no minimum (e.g. `y`
##        is empty).
select_closer <- function(x, y) {
  distances <- stringdist(x, y, method = "lv")
  y[which.min(distances)]
}
########################################################################
########################################################################
########################################################################
########################################################################
## Position of a single GI name; in this script it is referenced only from
## the commented-out spot check.
name_pos <- 123
## Two-character product-code prefixes (the `hs2` column) that are kept when
## the nomenclature is filtered; stored in sorted order.
hs2_keep <- sort(c(
  "02", "03", "04", "05", "07", "08", "12", "14", "15", "22",
  "16", "20", "21"
))
## n_cores <- 10
## plan(multicore(workers=return_cores(n_cores)))
## Read the raw GI table as delivered.
df_ini <- read_csv("GI_data_fixed.csv")
## Normalise the column names, drop empty rows/columns and derive cleaned
## variants of the lower-cased `desig_lab` column. The expressions inside
## mutate() are evaluated in order, so each step sees the result of the
## previous one.
df <- df_ini %>%
  clean_names() %>%
  remove_empty() %>%
  mutate(
    desig_lab = tolower(desig_lab),
    ## label_clean: diacritics removed, special characters replaced by a
    ## space, short words dropped (helper semantics presumably as their
    ## names suggest -- confirm in stat_lib.R).
    label_clean = remove_diacritics_convert_utf(desig_lab),
    label_clean = remove_special_char(label_clean, " "),
    label_clean = remove_short_words(label_clean, 3),
    ## products_latin: Greek and Cyrillic transliterated to Latin.
    products_latin = greek_to_latin(desig_lab),
    products_latin = cyrillic_to_latin(products_latin),
    ## products_ascii: ASCII version of products_latin, short words dropped.
    products_ascii = latin_to_ascii(products_latin),
    ## products_ascii = tolower(products_ascii),
    products_ascii = remove_short_words(products_ascii, 3)
  )
## t1 <- remove_diacritics_convert_utf(df$desig_lab)
## t2 <- remove_special_char(df$desig_lab)
## t3 <- greek_to_latin(df$desig_lab)
## Distinct cleaned labels, kept as a one-column table. The NA filter below
## is currently disabled, so a missing label (if any) stays in.
products <- distinct(df, label_clean)
## filter(!is.na(label_clean))
## Unique ASCII-transliterated names, as a plain vector.
products_dia <- unique(df$products_ascii)
## products_all <- bind_cols(products, products_diacritics)
## Export both lists for manual inspection.
save_excel(products, "GIs_from_agri.xlsx")
save_excel(products_dia, "GIs_from_agri_dia.xlsx")
## Read the product nomenclature (tab-separated).
nom_ini <- read_tsv("nomenclature.tsv")
## Keep only the rows whose two-character `productid` prefix is listed in
## hs2_keep, then build `prod_label2`: the product label converted to ASCII,
## lower-cased, with special characters replaced by a space and short words
## dropped (helper semantics presumably as their names suggest -- confirm).
nom <- nom_ini %>%
  mutate(hs2 = substrLeft(productid, 2)) %>%
  filter(hs2 %in% hs2_keep) %>%
  mutate(
    prod_label2 = latin_to_ascii(prod_label),
    prod_label2 = tolower(prod_label2),
    prod_label2 = remove_special_char(prod_label2, " "),
    prod_label2 = remove_short_words(prod_label2, 3)
  )
## ll <- stringdist(products_dia[name_pos], nom$prod_label2,
## method="dl")
## mm <- ll %>% which.min
## print("GI name")
## print(products_dia[name_pos])
## print("best match")
## print(nom$prod_label2[mm])
## The approach below is not useful
## test <- stringdistmatrix(products_dia, nom$prod_label2, method="lv")
## pos_min <- rep(NA, nrow(test))
## for (i in seq(nrow(test))){
## pos_min[i] <- which.min(test[i, ])
## i <- i+1
## }
## my_matches <- tibble(product=products_dia, nom_match=nom$prod_label2[pos_min])
## save_excel(my_matches, "matched_lines.xlsx")
## Keywords taken from the GI names (presumably the unique words longer than
## 3 characters, split on spaces -- confirm in stat_lib.R).
key <- find_long_words_unique(products_dia, 3, " ")
## Time the keyword lookup and the row filtering together.
tic()
## `my_matches` is used as a row filter on `nom`, so it is expected to be a
## logical vector with one entry per nomenclature row -- TODO confirm.
my_matches <- find_exact_matches(key, nom$prod_label2)
matched_data <- filter(nom, my_matches)
toc()
## Persist the matched nomenclature rows in three formats.
saveRDS(matched_data, "matched_data.RDS")
save_excel(matched_data, "matched_data.xlsx")
write_csv(matched_data, "matched_data.csv")
## my_matches <- map_df(products_dia, function(x) stringdist(x, nom$prod_label2,
## method="dl")
## )
print("So far so good")