diff --git a/README.md b/README.md index 7b35f2d..2440b01 100644 --- a/README.md +++ b/README.md @@ -12,32 +12,9 @@ - High Precision / Recall - Worldwide Names -## Installation -``` -npm install name-dataset -``` - -The NPM package is not available yet, to be published shortly. - -## Usage - -```js -const Names = require('name-dataset') -``` - - - -## How reliable is it? - -Well, it depends if you are looking for a high recall or a high precision. For example, the word Rose can be either a name or a noun. If we include it in the list, then we increase the precision but we decrease the recall. And vice versa, if it's not in the list. The library checks that the word starts with a capital letter. In our case, we emphasize more on precision. So I would say the best use case here is to check whether it's a name or not based on a prior knowledge that the customer has submitted a name. - -Here is an example on a (old) text: [ALI BABA AND THE FORTY THIEVES](http://textfiles.com/stories/ab40thv.txt). - - - ## Dataset Generation -[generate.sh](name-dataset/blob/master/generation/generate.sh) +[generate.sh](name-dataset/blob/master/generate/index.sh) - [listofrandomnames.com](http://listofrandomnames.com/index.cfm?generated) - [sajari.com 5000 Names around the Globe](https://www.sajari.com/public-data) diff --git a/assets/img_1.png b/assets/img_1.png deleted file mode 100644 index 74773c9..0000000 Binary files a/assets/img_1.png and /dev/null differ diff --git a/assets/img_2.png b/assets/img_2.png deleted file mode 100644 index 012506f..0000000 Binary files a/assets/img_2.png and /dev/null differ diff --git a/generation/diff.py b/generate/diff.py similarity index 100% rename from generation/diff.py rename to generate/diff.py diff --git a/generation/generate.sh b/generate/index.sh similarity index 100% rename from generation/generate.sh rename to generate/index.sh diff --git a/generation/load_and_compile_lib.sh b/generate/load_and_compile_lib.sh similarity index 100% rename from generation/load_and_compile_lib.sh rename to generate/load_and_compile_lib.sh diff --git a/generation/scripts/MatthiasWinkelmann_firstname-database/transform.sh b/generate/scripts/MatthiasWinkelmann_firstname-database/transform.sh similarity index 100% rename from generation/scripts/MatthiasWinkelmann_firstname-database/transform.sh rename to generate/scripts/MatthiasWinkelmann_firstname-database/transform.sh diff --git a/generation/scripts/OpenGenderTracking_globalnamedata/transform.sh b/generate/scripts/OpenGenderTracking_globalnamedata/transform.sh similarity index 100% rename from generation/scripts/OpenGenderTracking_globalnamedata/transform.sh rename to generate/scripts/OpenGenderTracking_globalnamedata/transform.sh diff --git a/generation/scripts/cs.cmu.edu.ai-repository/transform.sh b/generate/scripts/cs.cmu.edu.ai-repository/transform.sh similarity index 100% rename from generation/scripts/cs.cmu.edu.ai-repository/transform.sh rename to generate/scripts/cs.cmu.edu.ai-repository/transform.sh diff --git a/generation/scripts/datasets_imdb/transform.sh b/generate/scripts/datasets_imdb/transform.sh similarity index 100% rename from generation/scripts/datasets_imdb/transform.sh rename to generate/scripts/datasets_imdb/transform.sh diff --git a/generation/scripts/github_com_dominictarr_random-name/transform.sh b/generate/scripts/github_com_dominictarr_random-name/transform.sh similarity index 100% rename from generation/scripts/github_com_dominictarr_random-name/transform.sh rename to generate/scripts/github_com_dominictarr_random-name/transform.sh diff --git a/generation/scripts/github_com_smashew_NameDatabases/transform.sh b/generate/scripts/github_com_smashew_NameDatabases/transform.sh similarity index 100% rename from generation/scripts/github_com_smashew_NameDatabases/transform.sh rename to generate/scripts/github_com_smashew_NameDatabases/transform.sh diff --git a/generation/scripts/github_hadley_data-baby-names/transform.sh b/generate/scripts/github_hadley_data-baby-names/transform.sh similarity index 100% rename from generation/scripts/github_hadley_data-baby-names/transform.sh rename to generate/scripts/github_hadley_data-baby-names/transform.sh diff --git a/generation/scripts/httpswwwsajaricompublic-data/transform.py b/generate/scripts/httpswwwsajaricompublic-data/transform.py similarity index 100% rename from generation/scripts/httpswwwsajaricompublic-data/transform.py rename to generate/scripts/httpswwwsajaricompublic-data/transform.py diff --git a/generation/scripts/httpswwwsajaricompublic-data/transform.sh b/generate/scripts/httpswwwsajaricompublic-data/transform.sh similarity index 100% rename from generation/scripts/httpswwwsajaricompublic-data/transform.sh rename to generate/scripts/httpswwwsajaricompublic-data/transform.sh diff --git a/generation/scripts/mbejda.github.io/transform.sh b/generate/scripts/mbejda.github.io/transform.sh similarity index 100% rename from generation/scripts/mbejda.github.io/transform.sh rename to generate/scripts/mbejda.github.io/transform.sh diff --git a/generation/scripts/most-popular-names-for-the-past-100-years/transform.sh b/generate/scripts/most-popular-names-for-the-past-100-years/transform.sh similarity index 100% rename from generation/scripts/most-popular-names-for-the-past-100-years/transform.sh rename to generate/scripts/most-popular-names-for-the-past-100-years/transform.sh diff --git a/generation/scripts/ssa.gov_oact_babynames_limits/transform.sh b/generate/scripts/ssa.gov_oact_babynames_limits/transform.sh similarity index 100% rename from generation/scripts/ssa.gov_oact_babynames_limits/transform.sh rename to generate/scripts/ssa.gov_oact_babynames_limits/transform.sh diff --git a/generation/scripts/www.nrscotland.gov.uk/transform.sh b/generate/scripts/www.nrscotland.gov.uk/transform.sh similarity index 100% rename from generation/scripts/www.nrscotland.gov.uk/transform.sh rename to generate/scripts/www.nrscotland.gov.uk/transform.sh diff --git a/generation/scripts/www2.census.gov.1990surnames/transform.sh b/generate/scripts/www2.census.gov.1990surnames/transform.sh similarity index 100% rename from generation/scripts/www2.census.gov.1990surnames/transform.sh rename to generate/scripts/www2.census.gov.1990surnames/transform.sh diff --git a/generation/scripts/yorkshiretwist/transform.py b/generate/scripts/yorkshiretwist/transform.py similarity index 100% rename from generation/scripts/yorkshiretwist/transform.py rename to generate/scripts/yorkshiretwist/transform.py diff --git a/generation/scripts/yorkshiretwist/transform.sh b/generate/scripts/yorkshiretwist/transform.sh similarity index 100% rename from generation/scripts/yorkshiretwist/transform.sh rename to generate/scripts/yorkshiretwist/transform.sh diff --git a/names_dataset/__init__.py b/names_dataset/__init__.py deleted file mode 100644 index 5a85879..0000000 --- a/names_dataset/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from names_dataset.query import NameDataset - -__version__ = '1.9.1' diff --git a/names_dataset/query.py b/names_dataset/query.py deleted file mode 100644 index 897b735..0000000 --- a/names_dataset/query.py +++ /dev/null @@ -1,52 +0,0 @@ -import os - - -def _query(search_set, key, use_upper_case): - if use_upper_case and key.title() != key: - return False - return key.strip().lower() in search_set - - -class NameDataset: - FIRST_NAME_SEARCH = 0 - LAST_NAME_SEARCH = 1 - - def __init__(self): - first_names_filename = os.path.join(os.path.dirname(__file__), 'first_names.all.txt') - with open(first_names_filename, 'r', errors='ignore', encoding='utf8') as r: - self.first_names = set(r.read().strip().split('\n')) - last_names_filename = os.path.join(os.path.dirname(__file__), 'last_names.all.txt') - with open(last_names_filename, 'r', errors='ignore', encoding='utf8') as r: - self.last_names = set(r.read().strip().split('\n')) - - def search_first_name(self, first_name, use_upper_case=False): - return _query(self.first_names, first_name, use_upper_case) - - def search_last_name(self, last_name, use_upper_case=False): - return _query(self.last_names, last_name, use_upper_case) - - -if __name__ == '__main__': - import sys - - if sys.version_info < (3, 0): - print('Please use Python3+') - exit(1) - - if len(sys.argv) < 2: - print('Give names separated by a comma as input.') - sys.exit(1) - m = NameDataset() - names_list = sys.argv[1].split(',') - print('----- First names ----') - print('Name'.ljust(30), 'Present?') - for name in names_list: - # ljust just for aesthetic reasons ;) - print(str(name).ljust(30), m.search_first_name(name)) - - print('----- Last names ----') - print('Name'.ljust(30), 'Present?') - for name in names_list: - # ljust just for aesthetic reasons ;) - print(str(name).ljust(30), m.search_last_name(name)) - sys.exit(0)