nlqk.corpus
Natural Language Qu Kit (NLQK) - Quantum Natural Language Processing (QNLP) Library
Corpus package
(C) 2025 by Damir Cavar and NLP Lab
# coding: utf-8

"""
Natural Language Qu Kit (NLQK) - Quantum Natural Language Processing (QNLP) Library

Corpus package

(C) 2025 by [Damir Cavar](http://damir.cavar.me/) and [NLP Lab](https://nlp-lab.org/)

"""


import requests
from sys import platform
from pathlib import Path
import zipfile
import pandas as pd
import nlqk.defaults # nlqk
# import nlqk #. # as nlqk


def download_simlex999() -> bool:
    """Download the SimLex-999 dataset, extract it, and derive the noun files.

    The archive is stored in the corpora subfolder of the NLQK data folder.
    An archive that is already present is not downloaded again; it is only
    unpacked when the extracted SimLex-999 data file is missing. After
    unpacking, the noun pairs (rows whose ``POS`` column is ``N``) are written
    to ``nouns_data.txt`` and the sorted, de-duplicated nouns to ``nouns.txt``
    inside the extracted SimLex-999 folder.

    Returns:
        bool: True if the dataset was already extracted or was extracted and
        processed successfully, False otherwise.
    """
    # make sure there is a corpora subfolder in the data folder
    corpora_dir = nlqk.get_data_folder() / nlqk.defaults.DATA_FOLDER_CORPORA
    corpora_dir.mkdir(parents=True, exist_ok=True)

    simlex_file = corpora_dir / nlqk.defaults.SIMLEX_999_ZIP_FILE
    simlex_dir = corpora_dir / nlqk.defaults.SIMLEX_999_FOLDER
    simlex_main_file = simlex_dir / nlqk.defaults.SIMLEX_999_FILE

    if simlex_file.exists():
        # The corpora folder always exists at this point, so it cannot tell us
        # whether the archive was unpacked; the extracted data file can.
        if simlex_main_file.exists():
            return True
    else:
        try:
            # The timeout keeps an unresponsive server from blocking forever.
            response = requests.get(nlqk.defaults.SIMLEX_999_URL, stream=True, timeout=30)
            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
            with open(simlex_file, mode='wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        except (requests.exceptions.RequestException, OSError) as e:
            print(f"Error downloading file: {e}")
            # Do not leave a truncated archive behind: the next call would
            # mistake it for a complete, cached download.
            if simlex_file.exists():
                simlex_file.unlink()
            return False

    try:
        with zipfile.ZipFile(simlex_file, mode='r') as zip_ref:
            zip_ref.extractall(corpora_dir)
    except (zipfile.BadZipFile, OSError) as e:
        print(f"Error extracting archive {simlex_file}: {e}")
        return False

    # unpack the nouns
    if not simlex_main_file.exists():
        print(f"SimLex-999 file not found: {simlex_main_file}")
        return False
    try:
        df = pd.read_csv(simlex_main_file, sep='\t', header=0, encoding='utf-8')
        df_noun_pairs = df.loc[df['POS'] == 'N']
        df_noun_pairs.to_csv(simlex_dir / 'nouns_data.txt', sep='\t', index=False, encoding='utf-8')
        # Every noun once, from either side of a pair, in sorted order.
        nouns = sorted(
            set(df_noun_pairs['word1'].astype(str)) | set(df_noun_pairs['word2'].astype(str))
        )
        with open(simlex_dir / 'nouns.txt', mode='w', encoding='utf-8') as nouns_file:
            nouns_file.writelines(f"{noun}\n" for noun in nouns)
    except OSError as e:
        print(f"Error accessing file: {e}")
        return False
    return True
def
download_simlex999() -> bool:
def download_simlex999() -> bool:
    """Download the SimLex-999 dataset, extract it, and derive the noun files.

    The archive is stored in the corpora subfolder of the NLQK data folder.
    An archive that is already present is not downloaded again; it is only
    unpacked when the extracted SimLex-999 data file is missing. After
    unpacking, the noun pairs (rows whose ``POS`` column is ``N``) are written
    to ``nouns_data.txt`` and the sorted, de-duplicated nouns to ``nouns.txt``
    inside the extracted SimLex-999 folder.

    Returns:
        bool: True if the dataset was already extracted or was extracted and
        processed successfully, False otherwise.
    """
    # make sure there is a corpora subfolder in the data folder
    corpora_dir = nlqk.get_data_folder() / nlqk.defaults.DATA_FOLDER_CORPORA
    corpora_dir.mkdir(parents=True, exist_ok=True)

    simlex_file = corpora_dir / nlqk.defaults.SIMLEX_999_ZIP_FILE
    simlex_dir = corpora_dir / nlqk.defaults.SIMLEX_999_FOLDER
    simlex_main_file = simlex_dir / nlqk.defaults.SIMLEX_999_FILE

    if simlex_file.exists():
        # The corpora folder always exists at this point, so it cannot tell us
        # whether the archive was unpacked; the extracted data file can.
        if simlex_main_file.exists():
            return True
    else:
        try:
            # The timeout keeps an unresponsive server from blocking forever.
            response = requests.get(nlqk.defaults.SIMLEX_999_URL, stream=True, timeout=30)
            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
            with open(simlex_file, mode='wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        except (requests.exceptions.RequestException, OSError) as e:
            print(f"Error downloading file: {e}")
            # Do not leave a truncated archive behind: the next call would
            # mistake it for a complete, cached download.
            if simlex_file.exists():
                simlex_file.unlink()
            return False

    try:
        with zipfile.ZipFile(simlex_file, mode='r') as zip_ref:
            zip_ref.extractall(corpora_dir)
    except (zipfile.BadZipFile, OSError) as e:
        print(f"Error extracting archive {simlex_file}: {e}")
        return False

    # unpack the nouns
    if not simlex_main_file.exists():
        print(f"SimLex-999 file not found: {simlex_main_file}")
        return False
    try:
        df = pd.read_csv(simlex_main_file, sep='\t', header=0, encoding='utf-8')
        df_noun_pairs = df.loc[df['POS'] == 'N']
        df_noun_pairs.to_csv(simlex_dir / 'nouns_data.txt', sep='\t', index=False, encoding='utf-8')
        # Every noun once, from either side of a pair, in sorted order.
        nouns = sorted(
            set(df_noun_pairs['word1'].astype(str)) | set(df_noun_pairs['word2'].astype(str))
        )
        with open(simlex_dir / 'nouns.txt', mode='w', encoding='utf-8') as nouns_file:
            nouns_file.writelines(f"{noun}\n" for noun in nouns)
    except OSError as e:
        print(f"Error accessing file: {e}")
        return False
    return True
Download the SimLex-999 dataset and extract it to the appropriate folder. Returns: bool: True if the download and extraction were successful, False otherwise.