• R/O
  • SSH

Tags
No Tags

Frequently used words (click to add to your profile)

java c++ android linux c# windows objective-c cocoa 誰得 qt python php ruby game gui bathyscaphe c 計画中(planning stage) 翻訳 omegat framework twitter dom test vb.net directx ゲームエンジン btron arduino previewer

File Info

Rev. 9b20561b836912c95b00de5d9d9ae3211d8fc7ee
Size 3,449 bytes
Time 2020-09-25 05:16:10
Author Lorenzo Isella
Log Message

Again, I look for keywords in the nomenclature.

Content

## Script setup: clear the workspace, attach packages, load local helpers.

## Remove every object that ls() reports in the current environment.
## NOTE(review): this does not detach packages or reset options, so it is
## not equivalent to running the script in a fresh R session.
rm(list=ls())

## tidyverse supplies read_csv/read_tsv/write_csv and the dplyr verbs used
## below; janitor supplies clean_names() and remove_empty().
library(tidyverse)
library(openxlsx)
library(janitor)

## stringdist supplies stringdist(), used by select_closer(); tictoc supplies
## the tic()/toc() timer used around the matching step.
library(stringr)
library(stringi)
library(stringdist)
## library(furrr)
library(tictoc)

## Attached explicitly for the %$% exposition pipe used further down.
library(magrittr)

## NOTE(review): absolute, user-specific path. The helpers called below that
## are not defined in this file (e.g. save_excel, substrLeft,
## remove_short_words, find_exact_matches) presumably come from here --
## TODO confirm.
source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")



## Pick the entry of `y` that is closest to `x`.
##
## Closeness is the Levenshtein distance (stringdist method "lv") between
## `x` and each element of `y`. When several candidates share the smallest
## distance, the earliest one in `y` is returned, because which.min() reports
## the first minimum.
##
## x: string to look up.
## y: vector of candidate strings.
## Returns the element of `y` at the position of the smallest distance.
select_closer <- function(x, y) {
    distances <- stringdist(x, y, method = "lv")
    y[which.min(distances)]
}


########################################################################
########################################################################
########################################################################
########################################################################

## Position of a single GI name. In this file it is referenced only from
## the commented-out spot-check of the fuzzy matching further down.
name_pos <- 123

## Two-character codes used to filter the nomenclature: only rows whose `hs2`
## value is in this set are kept. Stored in sorted order.
hs2_keep <- sort(c(
    "02", "03", "04", "05", "07", "08", "12", "14", "15", "22",
    "16", "20", "21"
))


## n_cores <- 10




## plan(multicore(workers=return_cores(n_cores)))






## Load the raw GI table and derive normalised text columns from `desig_lab`.
##
## After clean_names() and removal of empty rows/columns, `desig_lab` is
## lower-cased in place and three columns are added:
##   label_clean    : desig_lab -> remove_diacritics_convert_utf()
##                    -> remove_special_char(., " ") -> remove_short_words(., 3)
##   products_latin : desig_lab -> greek_to_latin() -> cyrillic_to_latin()
##   products_ascii : products_latin -> latin_to_ascii()
##                    -> remove_short_words(., 3)
## The text helpers are not defined in this file; their exact behaviour has to
## be checked where they are defined.
df_ini <- read_csv("GI_data_fixed.csv")


df <- df_ini %>%
    clean_names() %>%
    ## `which` is spelled out on purpose: calling remove_empty() without it
    ## makes janitor print a "value for which not specified" message on every
    ## run, while dropping exactly the same empty rows and columns.
    remove_empty(which = c("rows", "cols")) %>%
    ## One mutate() is enough: later expressions see the columns assigned by
    ## earlier ones, so the steps still run in the order written.
    mutate(
        desig_lab = tolower(desig_lab),
        label_clean = remove_diacritics_convert_utf(desig_lab),
        label_clean = remove_special_char(label_clean, " "),
        label_clean = remove_short_words(label_clean, 3),
        products_latin = greek_to_latin(desig_lab),
        products_latin = cyrillic_to_latin(products_latin),
        products_ascii = latin_to_ascii(products_latin),
        ## An extra tolower() on products_ascii was tried and left disabled:
        ## products_ascii = tolower(products_ascii),
        products_ascii = remove_short_words(products_ascii, 3)
    )

## t1 <- remove_diacritics_convert_utf(df$desig_lab)
## t2 <- remove_special_char(df$desig_lab)

## t3 <- greek_to_latin(df$desig_lab)


## Distinct cleaned labels, kept as a one-column tibble. Rows with a missing
## label are not filtered out (the filter(!is.na(label_clean)) step was left
## disabled).
products <- df %>%
    distinct(label_clean)


## Distinct values of the products_ascii column, extracted as a bare vector.
products_dia <- unique(df[["products_ascii"]])



## products_all <- bind_cols(products, products_diacritics)


## Export both sets with save_excel() (helper not defined in this file).
save_excel(products, "GIs_from_agri.xlsx")

save_excel(products_dia, "GIs_from_agri_dia.xlsx")


## Load the product nomenclature and prepare a normalised label column.
##
## `hs2` is computed from `productid` with substrLeft(productid, 2)
## (presumably its first two characters -- TODO confirm in the helper
## library), and only rows whose `hs2` is listed in `hs2_keep` are retained.
## `prod_label2` is `prod_label` passed, in this order, through
## latin_to_ascii(), tolower(), remove_special_char(., " ") and
## remove_short_words(., 3).
nom_ini <- read_tsv("nomenclature.tsv")

nom <- nom_ini %>%
    mutate(hs2 = substrLeft(productid, 2)) %>%
    filter(hs2 %in% hs2_keep) %>%
    mutate(
        prod_label2 = latin_to_ascii(prod_label),
        prod_label2 = tolower(prod_label2),
        prod_label2 = remove_special_char(prod_label2, " "),
        prod_label2 = remove_short_words(prod_label2, 3)
    )





## ll <- stringdist(products_dia[name_pos], nom$prod_label2,
##                  method="dl")



## mm <- ll %>% which.min

## print("GI name")
## print(products_dia[name_pos])

## print("best match")
## print(nom$prod_label2[mm])


## The approach below is not useful


## test <- stringdistmatrix(products_dia, nom$prod_label2, method="lv")

## pos_min <- rep(NA, nrow(test))

## for (i in seq(nrow(test))){

## pos_min[i] <- which.min(test[i, ])

##     i <- i+1
    
## }



## my_matches <- tibble(product=products_dia, nom_match=nom$prod_label2[pos_min])

## save_excel(my_matches, "matched_lines.xlsx")



## Keyword search: build the keyword set from the ASCII GI names, flag the
## nomenclature labels that match it, and export the flagged rows.
## find_long_words_unique() and find_exact_matches() are not defined in this
## file; the result of the latter is used directly as the row filter on `nom`.
key <- find_long_words_unique(products_dia, 3, " ")

## Time the matching and the row selection together.
tic()

my_matches <- find_exact_matches(key, nom[["prod_label2"]])

matched_data <- filter(nom, my_matches)

toc()


## Same table written in three formats: RDS, Excel and CSV.
saveRDS(matched_data, "matched_data.RDS")

save_excel(matched_data, "matched_data.xlsx")

write_csv(matched_data, "matched_data.csv")



## Disabled alternative based on string distances:
## my_matches <- map_df(products_dia, function(x) stringdist(x, nom$prod_label2,
##                  method="dl")
##   )




## End-of-script marker.
print("So far so good")