@4ndr01d3 · Created January 25, 2017 14:20
Python scripts to compare Solr and Elasticsearch query response times.

# --- Elasticsearch test suite: queries the "interpro" index on hmmer-prod-db01:9200 ---
import unittest
import http.client
import json


class InterproElasticsearchTest(unittest.TestCase):
    server = "hmmer-prod-db01"
    port = 9200
    response_times = []
    # Query strings (URL-encoded, %20 = space) mapped to the label used in the timing report.
    queries = {
        "*:*": "Number of Docs per entry_db",
        "protein_db:s": "Number of Swissprot Docs per entry_db",
        "protein_db:t": "Number of Trembl Docs per entry_db",
        "protein_acc:a0a0a2t3z9": "Number of Docs with protein Acc per entry_db",
        "structure_acc:*": "Number of Docs with structure per entry_db",
        "structure_acc:3nyw": "Number of Docs with structure Acc per entry_db",
        "!entry_db:interpro%20AND%20!integrated:*": "Number of Unintegrated Docs per entry_db",
    }
    def setUp(self):
        self._connection = http.client.HTTPConnection(self.server, self.port)

    def tearDown(self):
        self._connection.close()

    @classmethod
    def tearDownClass(cls):
        # Print the collected "took" times (milliseconds) once all tests have run.
        print("\n" + ("*" * 80))
        print("-= ElasticSearch response times =-".center(80) + "\n")
        for k, v in cls.response_times:
            print("{:>68}: {:>8}".format(k, v))
    def test_elastic_index_exists(self):
        self._connection.request("GET", "/interpro")
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")

    def test_elastic_index_returns_json(self):
        self._connection.request("GET", "/interpro/relationship/_search")
        response = self._connection.getresponse()
        data = response.read().decode()
        try:
            json.loads(data)
        except json.JSONDecodeError:
            self.fail("The document is not JSON")
    def _elastic_json_query(self, q, query_obj):
        # The query string goes in the URL; the aggregation goes in the request body.
        self._connection.request(
            "GET",
            "/interpro/relationship/_search?pretty&q=" + q,
            json.dumps(query_obj)
        )
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")
        data = response.read().decode()
        try:
            obj = json.loads(data)
            self.assertIn("took", obj)
            self.assertIn("hits", obj)
            self.assertIn("aggregations", obj)
            return obj
        except json.JSONDecodeError:
            self.fail("The document is not JSON")
    def test_number_of_docs_per_entry_db(self):
        facet = {
            "aggs": {
                "rscount": {
                    "terms": {
                        "field": "entry_db"
                    }
                }
            },
            "size": 0
        }
        for q, tag in self.queries.items():
            response = self._elastic_json_query(q, facet)
            self.response_times.append((tag, response["took"]))
    def test_number_of_unique_entries_per_entry_db(self):
        facet = {
            "aggs": {
                "rscount": {
                    "terms": {
                        "field": "entry_db"
                    },
                    "aggs": {
                        "unique_entries": {
                            "cardinality": {
                                "field": "entry_acc"
                            }
                        }
                    }
                }
            },
            "size": 0
        }
        for q, tag in self.queries.items():
            response = self._elastic_json_query(q, facet)
            self.response_times.append((tag + " (unique entries)", response["took"]))

    def test_number_of_unique_proteins_per_entry_db(self):
        facet = {
            "aggs": {
                "rscount": {
                    "terms": {
                        "field": "entry_db"
                    },
                    "aggs": {
                        "unique_proteins": {
                            "cardinality": {
                                "field": "protein_acc"
                            }
                        }
                    }
                }
            },
            "size": 0
        }
        for q, tag in self.queries.items():
            response = self._elastic_json_query(q, facet)
            self.response_times.append((tag + " (unique proteins)", response["took"]))

    def test_grouping_entries(self):
        fq = "{}:*%20AND%20{}_acc:{}".format("entry_acc", "protein", "protein_64440985")
        for q, tag in self.queries.items():
            response = self._elastic_group_query(q, "entry_acc", 1, 0)
            self.response_times.append((tag + " (group)", response["took"]))
            response = self._elastic_group_query(q, "entry_acc", 1, 0, fq)
            self.response_times.append((tag + " (group+fq)", response["took"]))
    def _elastic_group_query(self, q, field, rows, start, fq=""):
        # Group by `field` with a terms aggregation, keeping the top hit of each bucket.
        query_obj = {
            "size": 0,
            "aggs": {
                "by_entry": {
                    "terms": {
                        "field": field,
                        "size": rows
                    },
                    "aggs": {
                        "tops": {
                            "top_hits": {"size": 1}
                        }
                    }
                }
            }
        }
        if fq != "":
            fq = "%20AND%20" + fq
        self._connection.request(
            "GET",
            "/interpro/relationship/_search?pretty&q=" + q + fq,
            json.dumps(query_obj)
        )
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")
        data = response.read().decode()
        try:
            obj = json.loads(data)
            self.assertIn("took", obj)
            self.assertIn("hits", obj)
            self.assertIn("aggregations", obj)
            return obj
        except json.JSONDecodeError:
            self.fail("The document is not JSON")
if __name__ == '__main__':
    unittest.main()
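The Elasticsearch helper above passes each query as the URI q= parameter (which Elasticsearch interprets as a query_string query, hence the %20AND%20 encoding) while the aggregation travels in the request body. As a minimal standalone sketch, here is the same call with everything moved into the body; the host, index and field names are taken from the code above, and the snippet is only an illustration of what one request amounts to:

import http.client
import json

# Equivalent of _elastic_json_query("protein_db:s", facet), expressed entirely
# as a request body: a query_string query plus a terms aggregation on entry_db.
body = {
    "size": 0,
    "query": {"query_string": {"query": "protein_db:s"}},
    "aggs": {"rscount": {"terms": {"field": "entry_db"}}},
}
conn = http.client.HTTPConnection("hmmer-prod-db01", 9200)
conn.request("GET", "/interpro/relationship/_search?pretty", json.dumps(body))
print(json.loads(conn.getresponse().read().decode())["took"])
conn.close()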
# --- Solr test suite: queries the "generated" core on hmmer-prod-db02:8983 ---
import unittest
import http.client
import json
import urllib.parse


class InterproSolrTest(unittest.TestCase):
    server = "hmmer-prod-db02"
    port = 8983
    response_times = {}
    # Query strings mapped to the label used in the timing report.
    queries = {
        "*:*": "Number of Docs per entry_db",
        "protein_db:swissprot": "Number of Swissprot Docs per entry_db",
        "protein_db:trembl": "Number of Trembl Docs per entry_db",
        "protein_acc:protein_64440985": "Number of Docs with protein Acc per entry_db",
        "structure_acc:*": "Number of Docs with structure per entry_db",
        "structure_acc:protein_32860": "Number of Docs with structure Acc per entry_db",
        "!entry_db:interpro && !integrated:*": "Number of Unintegrated Docs per entry_db",
    }
    def setUp(self):
        self._connection = http.client.HTTPConnection(self.server, self.port)

    def tearDown(self):
        self._connection.close()

    @classmethod
    def tearDownClass(cls):
        # Print the collected QTime values (milliseconds) once all tests have run.
        print("\n*********\n-= Solr response times =-\n")
        for k, v in cls.response_times.items():
            print("{:>60}: {:>8}".format(k, v))
    def test_solr_core_exists(self):
        self._connection.request("GET", "/solr/generated/select")
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")

    def test_solr_core_returns_json(self):
        self._connection.request("GET", "/solr/generated/select?indent=on&wt=json")
        response = self._connection.getresponse()
        data = response.read().decode()
        try:
            json.loads(data)
        except json.JSONDecodeError:
            self.fail("The document is not JSON")
    def _solr_json_query(self, q, json_facet):
        # Run `q` with a JSON Facet API facet and return the parsed response.
        params = {
            "indent": "on",
            "wt": "json",
            "q": q,
            "rows": "0",
            "facet": "on",
            "json.facet": json.dumps(json_facet)  # serialise the facet spec as JSON
        }
        self._connection.request("GET", "/solr/generated/select?" + urllib.parse.urlencode(params))
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")
        data = response.read().decode()
        try:
            obj = json.loads(data)
            self.assertIn("responseHeader", obj)
            self.assertIn("response", obj)
            self.assertIn("facets", obj)
            return obj
        except json.JSONDecodeError:
            self.fail("The document is not JSON")
    @unittest.skip("yes")
    def test_number_of_docs_per_entry_db(self):
        facet = {
            "databases": {
                "type": "terms",
                "field": "entry_db",
            }
        }
        for q, tag in self.queries.items():
            response = self._solr_json_query(q, facet)
            self.response_times[tag] = response["responseHeader"]["QTime"]

    @unittest.skip("yes")
    def test_number_of_unique_entries_per_entry_db(self):
        facet = {
            "databases": {
                "type": "terms",
                "field": "entry_db",
                "facet": {
                    "unique": "unique(entry_acc)"
                }
            }
        }
        for q, tag in self.queries.items():
            response = self._solr_json_query(q, facet)
            self.response_times[tag + " (unique entries)"] = response["responseHeader"]["QTime"]

    @unittest.skip("yes")
    def test_number_of_unique_proteins_per_entry_db(self):
        facet = {
            "databases": {
                "type": "terms",
                "field": "entry_db",
                "facet": {
                    "unique": "unique(protein_acc)"
                }
            }
        }
        for q, tag in self.queries.items():
            response = self._solr_json_query(q, facet)
            self.response_times[tag + " (unique proteins)"] = response["responseHeader"]["QTime"]
    def test_grouping_entries(self):
        fq = "{}:* && {}_acc:{}".format("entry_acc", "protein", "protein_64440985")
        for q, tag in self.queries.items():
            response = self._solr_group_query(q, "entry_acc", 1, 0)
            self.response_times[tag + " (group)"] = response["responseHeader"]["QTime"]
            response = self._solr_group_query(q, "entry_acc", 1, 0, fq)
            self.response_times[tag + " (group+fq)"] = response["responseHeader"]["QTime"]
    def _solr_group_query(self, q, field, rows, start, fq=None):
        # Group results by `field` using Solr result grouping.
        params = {
            "indent": "on",
            "wt": "json",
            "group": "true",
            "group.field": field,
            "group.ngroups": "true",
            "rows": rows,
            "start": start,
            "q": q,
        }
        if fq is not None:
            params["fq"] = fq.lower()
        self._connection.request("GET", "/solr/generated/select?" + urllib.parse.urlencode(params))
        response = self._connection.getresponse()
        self.assertEqual(response.status, 200, "The response should be OK")
        data = response.read().decode()
        try:
            obj = json.loads(data)
            self.assertIn("responseHeader", obj)
            self.assertIn("grouped", obj)
            return obj
        except json.JSONDecodeError:
            self.fail("The document is not JSON")
if __name__ == '__main__':
    unittest.main()
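Both suites print their timing tables from tearDownClass, so running them back to back gives a rough side-by-side comparison of Solr and Elasticsearch response times. A minimal sketch of a combined runner, assuming the two classes above are saved as the (hypothetical) modules elastic_tests and solr_tests:

import unittest

# Hypothetical module names; adjust to wherever the two test classes are saved.
loader = unittest.TestLoader()
suite = unittest.TestSuite()
suite.addTests(loader.loadTestsFromName("elastic_tests"))
suite.addTests(loader.loadTestsFromName("solr_tests"))
unittest.TextTestRunner(verbosity=2).run(suite)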