nlqk.corpus

Natural Language Qu Kit (NLQK) - Quantum Natural Language Processing (QNLP) Library

Corpus package

(C) 2025 by Damir Cavar and NLP Lab

 1# coding: utf-8
 2
 3"""
 4Natural Language Qu Kit (NLQK) - Quantum Natural Language Processing (QNLP) Library
 5
 6Corpus package
 7
 8(C) 2025 by [Damir Cavar](http://damir.cavar.me/) and [NLP Lab](https://nlp-lab.org/)
 9
10"""
11
12
13import requests
14from sys import platform
15from pathlib import Path
16import zipfile
17import pandas as pd
18import nlqk.defaults # nlqk
19# import nlqk #. # as nlqk
20
21
def download_simlex999() -> bool:
    """Download the SimLex-999 dataset, extract it, and derive the noun files.

    The archive is stored in the corpora subfolder of the NLQK data folder and
    unpacked there. From the main SimLex-999 file, the rows with ``POS == 'N'``
    are written to ``nouns_data.txt``, and the sorted set of their ``word1`` /
    ``word2`` values is written to ``nouns.txt`` (one word per line). Both
    files are created inside the extracted SimLex-999 folder.

    If the archive and the extracted main file are both already present,
    nothing is done. If only the archive is present, it is extracted again
    instead of being downloaded a second time.

    Returns:
        bool: True if the dataset is available and was processed successfully,
            False if the download, the extraction, or the file processing
            failed.
    """
    # Make sure there is a corpora subfolder in the data folder.
    data_folder = nlqk.get_data_folder() / nlqk.defaults.DATA_FOLDER_CORPORA
    data_folder.mkdir(parents=True, exist_ok=True)

    simlex_file = data_folder / nlqk.defaults.SIMLEX_999_ZIP_FILE
    simlex_folder = data_folder / nlqk.defaults.SIMLEX_999_FOLDER
    simlex_main_file = simlex_folder / nlqk.defaults.SIMLEX_999_FILE

    if simlex_file.exists():
        # Test the extracted main file, not data_folder: data_folder always
        # exists at this point, so checking it would make the re-extraction
        # below unreachable.
        if simlex_main_file.exists():
            return True
    else:
        # Stream into a temporary ".part" file and rename it only once the
        # download is complete, so that an interrupted transfer is never
        # mistaken for a valid cached archive on the next call.
        partial_file = simlex_file.with_name(simlex_file.name + '.part')
        try:
            # The timeout keeps an unresponsive server from blocking forever;
            # the context manager releases the streamed connection.
            with requests.get(nlqk.defaults.SIMLEX_999_URL, stream=True, timeout=60) as response:
                response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
                with open(partial_file, mode='wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            partial_file.replace(simlex_file)
        except (requests.exceptions.RequestException, OSError) as e:
            print(f"Error downloading file: {e}")
            return False

    try:
        with zipfile.ZipFile(simlex_file, mode='r') as zip_ref:
            zip_ref.extractall(data_folder)
    except (zipfile.BadZipFile, OSError) as e:
        print(f"Error extracting file: {e}")
        return False

    # Unpack the nouns.
    if not simlex_main_file.exists():
        print(f"SimLex-999 file not found: {simlex_main_file}")
        return False
    try:
        df = pd.read_csv(simlex_main_file, sep='\t', header=0, encoding='utf-8')
        df_noun_pairs = df.loc[df['POS'] == 'N']
        df_noun_pairs.to_csv(simlex_folder / 'nouns_data.txt', sep='\t', index=False, encoding='utf-8')
        # Unique words from both columns of the noun pairs, in sorted order.
        nouns = sorted(
            set(df_noun_pairs['word1'].astype(str)) | set(df_noun_pairs['word2'].astype(str))
        )
        with open(simlex_folder / 'nouns.txt', mode='w', encoding='utf-8') as nouns_file:
            nouns_file.writelines(f"{noun}\n" for noun in nouns)
    except OSError as e:
        print(f"Error accessing file: {e}")
        return False
    return True
def download_simlex999() -> bool:
def download_simlex999() -> bool:
    """Download the SimLex-999 dataset, extract it, and derive the noun files.

    The archive is stored in the corpora subfolder of the NLQK data folder and
    unpacked there. From the main SimLex-999 file, the rows with ``POS == 'N'``
    are written to ``nouns_data.txt``, and the sorted set of their ``word1`` /
    ``word2`` values is written to ``nouns.txt`` (one word per line). Both
    files are created inside the extracted SimLex-999 folder.

    If the archive and the extracted main file are both already present,
    nothing is done. If only the archive is present, it is extracted again
    instead of being downloaded a second time.

    Returns:
        bool: True if the dataset is available and was processed successfully,
            False if the download, the extraction, or the file processing
            failed.
    """
    # Make sure there is a corpora subfolder in the data folder.
    data_folder = nlqk.get_data_folder() / nlqk.defaults.DATA_FOLDER_CORPORA
    data_folder.mkdir(parents=True, exist_ok=True)

    simlex_file = data_folder / nlqk.defaults.SIMLEX_999_ZIP_FILE
    simlex_folder = data_folder / nlqk.defaults.SIMLEX_999_FOLDER
    simlex_main_file = simlex_folder / nlqk.defaults.SIMLEX_999_FILE

    if simlex_file.exists():
        # Test the extracted main file, not data_folder: data_folder always
        # exists at this point, so checking it would make the re-extraction
        # below unreachable.
        if simlex_main_file.exists():
            return True
    else:
        # Stream into a temporary ".part" file and rename it only once the
        # download is complete, so that an interrupted transfer is never
        # mistaken for a valid cached archive on the next call.
        partial_file = simlex_file.with_name(simlex_file.name + '.part')
        try:
            # The timeout keeps an unresponsive server from blocking forever;
            # the context manager releases the streamed connection.
            with requests.get(nlqk.defaults.SIMLEX_999_URL, stream=True, timeout=60) as response:
                response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
                with open(partial_file, mode='wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            partial_file.replace(simlex_file)
        except (requests.exceptions.RequestException, OSError) as e:
            print(f"Error downloading file: {e}")
            return False

    try:
        with zipfile.ZipFile(simlex_file, mode='r') as zip_ref:
            zip_ref.extractall(data_folder)
    except (zipfile.BadZipFile, OSError) as e:
        print(f"Error extracting file: {e}")
        return False

    # Unpack the nouns.
    if not simlex_main_file.exists():
        print(f"SimLex-999 file not found: {simlex_main_file}")
        return False
    try:
        df = pd.read_csv(simlex_main_file, sep='\t', header=0, encoding='utf-8')
        df_noun_pairs = df.loc[df['POS'] == 'N']
        df_noun_pairs.to_csv(simlex_folder / 'nouns_data.txt', sep='\t', index=False, encoding='utf-8')
        # Unique words from both columns of the noun pairs, in sorted order.
        nouns = sorted(
            set(df_noun_pairs['word1'].astype(str)) | set(df_noun_pairs['word2'].astype(str))
        )
        with open(simlex_folder / 'nouns.txt', mode='w', encoding='utf-8') as nouns_file:
            nouns_file.writelines(f"{noun}\n" for noun in nouns)
    except OSError as e:
        print(f"Error accessing file: {e}")
        return False
    return True

Download the SimLex-999 dataset and extract it to the appropriate folder.

Returns (bool): True if the download and extraction were successful, False otherwise.