Implementation of SWEM (Simple Word-Embedding-based Models)
Baseline Needs More Love: On Simple Word-Embedding-Based Models and Associated Pooling Mechanisms (ACL 2018)
pip install swem
Examples are available in the examples directory.
from typing import List
import numpy as np
import swem
from gensim.models import KeyedVectors
if __name__ == '__main__':
kv: KeyedVectors = KeyedVectors(vector_size=200)
tokens: List[str] = ['I', 'have', 'a', 'pen']
embed: np.ndarray = swem.infer_vector(
tokens=tokens, kv=kv, method='concat'
)
print(embed.shape)
from typing import List
import swem
from gensim.models import KeyedVectors
if __name__ == '__main__':
kv: KeyedVectors = KeyedVectors(vector_size=200)
swem_embed = swem.SWEM(kv)
tokens: List[str] = ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']
embed = swem_embed.infer_vector(tokens, method='max')
print(embed.shape)
Results
(200,)
from typing import List
import swem
from gensim.models import KeyedVectors
if __name__ == '__main__':
kv: KeyedVectors = KeyedVectors(vector_size=200)
swem_embed = swem.SWEM(kv)
tokens: List[str] = ['This', 'is', 'an', 'implementation', 'of', 'SWEM']
embed = swem_embed.infer_vector(tokens, method='max')
print(embed.shape)
Results
(200,)
SWEM generates a random vector when a given token is out of vocabulary. To make these token embeddings reproducible, you need to set NumPy's random seed.
from typing import List
import numpy as np
import swem
from gensim.models import KeyedVectors
if __name__ == '__main__':
np.random.seed(0)
kv: KeyedVectors = KeyedVectors(vector_size=200)
tokens: List[str] = ['I', 'have', 'a', 'pen']
embed: np.ndarray = swem.infer_vector(
tokens=tokens, kv=kv, method='concat'
)
print(embed.shape)
import swem
swem.download_w2v(lang='ja')
kv = swem.load_w2v(lang='ja')
Downloading w2v file to /Users/<username>/.swem/ja.zip
Extract zipfile into /Users/<username>/.swem/ja
Success to extract files.