Data
parent
34a86c1942
commit
569b5b2ef3
25
README.md
25
README.md
|
@ -12,32 +12,9 @@
|
|||
- High Precision / Recall
|
||||
- Worldwide Names
|
||||
|
||||
## Installation
|
||||
```
|
||||
npm install name-dataset
|
||||
```
|
||||
|
||||
The NPM package is not available yet; it will be published shortly.
|
||||
|
||||
## Usage
|
||||
|
||||
```js
|
||||
const Names = require('name-dataset')
|
||||
```
|
||||
|
||||
<img src='assets/img_1.png'/>
|
||||
|
||||
## How reliable is it?
|
||||
|
||||
Well, it depends on whether you are looking for high recall or high precision. For example, the word Rose can be either a name or a noun. If we include it in the list, then we increase the recall but we decrease the precision, and vice versa if it's not in the list. The library checks that the word starts with a capital letter. In our case, we put more emphasis on precision. So I would say the best use case here is to check whether a word is a name or not, based on prior knowledge that the customer has submitted a name.
|
||||
|
||||
Here is an example on an (old) text: [ALI BABA AND THE FORTY THIEVES](http://textfiles.com/stories/ab40thv.txt).
|
||||
|
||||
<img src='assets/img_2.png'/>
|
||||
|
||||
## Dataset Generation
|
||||
|
||||
[generate.sh](name-dataset/blob/master/generation/generate.sh)
|
||||
[generate.sh](name-dataset/blob/master/generate/index.sh)
|
||||
|
||||
- [listofrandomnames.com](http://listofrandomnames.com/index.cfm?generated)
|
||||
- [sajari.com 5000 Names around the Globe](https://www.sajari.com/public-data)
|
||||
|
|
BIN
assets/img_1.png
BIN
assets/img_1.png
Binary file not shown.
Before Width: | Height: | Size: 38 KiB |
BIN
assets/img_2.png
BIN
assets/img_2.png
Binary file not shown.
Before Width: | Height: | Size: 476 KiB |
|
@ -1,3 +0,0 @@
|
|||
from names_dataset.query import NameDataset
|
||||
|
||||
__version__ = '1.9.1'
|
|
@ -1,52 +0,0 @@
|
|||
import os
|
||||
|
||||
|
||||
def _query(search_set, key, use_upper_case):
    """Report whether ``key`` is present in ``search_set``.

    When ``use_upper_case`` is truthy, the key must already be title-cased
    (``key.title() == key``); otherwise the lookup is rejected without
    consulting the set. The membership test itself uses the key with
    surrounding whitespace stripped and converted to lower case.
    """
    enforce_title_case = bool(use_upper_case)
    if enforce_title_case and key != key.title():
        # Caller asked for a capitalisation check and the key failed it.
        return False
    normalized = key.strip().lower()
    return normalized in search_set
|
||||
|
||||
|
||||
class NameDataset:
    """In-memory lookup of first and last names.

    Both name lists are read once, at construction time, from
    ``first_names.all.txt`` and ``last_names.all.txt`` located in the same
    directory as this module. Each list is held as a set of strings.
    """

    # Not referenced inside this class; kept because external callers may
    # rely on them.
    FIRST_NAME_SEARCH = 0
    LAST_NAME_SEARCH = 1

    def __init__(self):
        """Load the first-name and last-name sets from the bundled files."""
        data_dir = os.path.dirname(__file__)
        self.first_names = self._load_names(
            os.path.join(data_dir, 'first_names.all.txt'))
        self.last_names = self._load_names(
            os.path.join(data_dir, 'last_names.all.txt'))

    @staticmethod
    def _load_names(path):
        """Return the set of non-blank, whitespace-stripped lines in ``path``.

        Blank lines are skipped so the empty string never ends up in the
        set (which would make an empty or whitespace-only query look like a
        known name). Each entry is stripped because lookups strip the key
        before testing membership, so an entry with stray whitespace could
        never match. Undecodable bytes are dropped (``errors='ignore'``).
        """
        with open(path, 'r', errors='ignore', encoding='utf8') as reader:
            stripped_lines = (line.strip() for line in reader)
            return {name for name in stripped_lines if name}

    def search_first_name(self, first_name, use_upper_case=False):
        """Return whether ``first_name`` is in the first-name set.

        ``use_upper_case`` is forwarded to ``_query`` unchanged.
        """
        return _query(self.first_names, first_name, use_upper_case)

    def search_last_name(self, last_name, use_upper_case=False):
        """Return whether ``last_name`` is in the last-name set.

        ``use_upper_case`` is forwarded to ``_query`` unchanged.
        """
        return _query(self.last_names, last_name, use_upper_case)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: takes one argument, a comma-separated list
    # of names, and prints whether each is a known first name and a known
    # last name.
    import sys

    if sys.version_info < (3, 0):
        print('Please use Python3+')
        # sys.exit rather than the bare exit(): the latter is injected by
        # the site module and is not guaranteed to exist (e.g. python -S).
        sys.exit(1)

    if len(sys.argv) < 2:
        print('Give names separated by a comma as input.')
        sys.exit(1)

    m = NameDataset()
    names_list = sys.argv[1].split(',')

    # One (header, lookup) pair per report section; both sections share the
    # same table layout.
    sections = (
        ('----- First names ----', m.search_first_name),
        ('----- Last names ----', m.search_last_name),
    )
    for header, search in sections:
        print(header)
        print('Name'.ljust(30), 'Present?')
        for name in names_list:
            # ljust just for aesthetic reasons ;)
            print(name.ljust(30), search(name))
    sys.exit(0)
|
Loading…
Reference in New Issue