Skip to content

Instantly share code, notes, and snippets.

View Witiko's full-sized avatar

Vít Starý Novotný Witiko

  • Masaryk University
  • Brno, Czech Republic
View GitHub Profile
@Witiko
Witiko / interpret_soft_cosine_measure.py
Created March 14, 2021 09:25
Interprets the soft cosine measure in Gensim 4 as a sum of word pair similarities
def interpret_soft_cosine_measure(doc1, doc2, dictionary, similarity_matrix):
word_pair_importances = dict()
for word1_id, word1_weight in doc1:
for word2_id, word2_weight in doc2:
word_similarity = similarity_matrix.matrix[word1_id, word2_id]
word_pair_importance = word1_weight * word_similarity * word2_weight
if word_pair_importance == 0:
continue
word1 = dictionary.id2token[word1_id]
word2 = dictionary.id2token[word2_id]
@Witiko
Witiko / evaluate-speed-pie-chart.py
Created October 16, 2020 21:53
Creates a pie chart from a GNU Parallel joblog after running OCR-D
# -*- coding:utf-8 -*-
from itertools import dropwhile
import json
import re
import sys
import matplotlib.pyplot as plt
#!/bin/sh
# Produces mean amount of financial support by extracting project codes from a PDF document and querying starfos.tacr.cz.
#
# Usage: ./get-mean-tacr-support.sh FILE, where FILE is a PDF document with a table of supported projects, such as
# https://www.tacr.cz/wp-content/uploads/documents/2019/10/29/1572358378_Vyhlaseni_vysledku_eTA_na_web_-_podporene.pdf
set -e
pdfgrep TL[0-9]+ "$1" |
sed -r 's/.*\s(TL[0-9]+)(\s.*|$)/\1/' |
{
"embeddings": [
{
"tensorName": "The soft VSM with non-regularized word embeddings on the TWITTER dataset",
"tensorShape": [
3108,
3
],
"tensorPath": "https://gist.githubusercontent.com/Witiko/860f86ca52c89ee97714371ac2a91a62/raw/8df9801310d78223e67520fad47ba2cc7db0ac2d/docsim-dense_scm-twitter-1-False-True-True-800--1.0-2-vectors.csv",
"metadataPath": "https://gist.githubusercontent.com/Witiko/860f86ca52c89ee97714371ac2a91a62/raw/8df9801310d78223e67520fad47ba2cc7db0ac2d/docsim-dense_scm-twitter-1-False-True-True-800--1.0-2-metadata.csv"
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
Neutral
Neutral
Neutral
Positive
Neutral
Neutral
Negative
Neutral
Neutral
Neutral
{
"embeddings": [
{
"tensorName": "The soft VSM with regularized word embeddings on the REUTERS dataset",
"tensorShape": [
7674,
3
],
"tensorPath": "https://gist.githubusercontent.com/Witiko/f071349d0aa864ef9d4ab32618d7a7a0/raw/79b3c0e0754cc98d287467b7490389164c5ed08c/docsim-sparse_scm-reuters-1-True-True-True-200--1.0-2-vectors.csv",
"metadataPath": "https://gist.githubusercontent.com/Witiko/a8fe62af9538fd3e265b155f2f6c1c2e/raw/b7a24d91a30d1b63fc80c4ccb06d1929a10307ff/docsim-dense_scm-reuters-1-True-True-True-200--1.0-2-metadata.csv"
{
"embeddings": [
{
"tensorName": "The soft VSM with non-regularized word embeddings on the REUTERS dataset",
"tensorShape": [
7674,
3
],
"tensorPath": "https://gist.githubusercontent.com/Witiko/01d2cba2b70394994e9676b767fb5b54/raw/77828cea827ef2e85ff1728e9f7a164a7f948994/docsim-dense_scm-reuters-1-True-True-True-200--1.0-2-vectors.csv",
"metadataPath": "https://gist.githubusercontent.com/Witiko/a8fe62af9538fd3e265b155f2f6c1c2e/raw/b7a24d91a30d1b63fc80c4ccb06d1929a10307ff/docsim-dense_scm-reuters-1-True-True-True-200--1.0-2-metadata.csv"
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
-1.7881473302841187 -21.638370513916016 -8.322571754455566
11.161980628967285 -4.62389612197876 10.291177749633789
8.381392478942871 2.9024336338043213 9.12906551361084
0.03650572896003723 16.332168579101562 10.112354278564453
-3.538588285446167 20.933395385742188 1.6825966835021973
4.521414279937744 -19.92893409729004 3.049839973449707
7.068723678588867 0.31095945835113525 9.246502876281738
-0.6464729905128479 2.4970457553863525 1.924015998840332
-6.410751819610596 0.5469326376914978 6.407222747802734
-14.470321655273438 -5.763935089111328 8.814469337463379
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
6.424992084503174 -6.662110805511475 21.37932586669922
-19.867834091186523 8.192985534667969 6.94021463394165
-9.401764869689941 5.466089725494385 8.77171516418457
-3.0411410331726074 18.178956985473633 16.64122200012207
5.479282379150391 17.74678611755371 7.453356742858887
-1.0430454015731812 -27.974584579467773 2.0146751403808594
-6.216945171356201 7.838143348693848 11.96973705291748
16.152389526367188 0.28177687525749207 9.88743782043457
5.373088359832764 19.697660446166992 0.1268618255853653
-1.7863715887069702 19.714889526367188 -8.857694625854492
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
Crude
Earn
Earn
Money-fx
Trade
Earn
Earn
Acq
Acq
Acq