Skip to content

lodc

KG_FORMAT_TERMS = {KGFormat.RDF: ['rdf'], KGFormat.TURTLE: ['turtle', 'ttl'], KGFormat.NTRIPLES: ['.nt', 'ntriples', 'n-triples'], KGFormat.NQUADS: ['nq', 'nquads', 'n-quads'], KGFormat.JSONLD: ['jsonld', 'json-ld'], KGFormat.OWL: ['owl']} module-attribute

KG_TERMS = ['rdf', 'turtle', 'ntriples', 'n-triples', 'owl', 'nquads', 'n-quads', 'jsonld', 'json-ld', '.nt', '.ttl', '.rdf', '.nq', '.jsonld', '.json', '.owl'] module-attribute

KGFormat

Bases: Enum

Source code in datasets/views/lodc.py
11
12
13
14
15
16
17
class KGFormat(Enum):
    """Serialization formats a knowledge-graph download may be published in.

    Values are the canonical short format tokens used for matching against
    download metadata (see the module-level KG_FORMAT_TERMS mapping).
    """
    RDF = "rdf"
    TURTLE = "turtle"
    NTRIPLES = "nt"
    NQUADS = "nq"
    JSONLD = "jsonld"
    OWL = "owl"

JSONLD = 'jsonld' class-attribute

NQUADS = 'nq' class-attribute

NTRIPLES = 'nt' class-attribute

OWL = 'owl' class-attribute

RDF = 'rdf' class-attribute

TURTLE = 'turtle' class-attribute

preprocess_dataset(dataset)

Source code in datasets/views/lodc.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def preprocess_dataset(dataset: dict):
    """Preprocess every download entry of a dataset record and attach
    aggregate availability / knowledge-graph-detection counters.

    Args:
        dataset: Raw dataset record; may carry 'full_download',
            'other_download' and 'sparql' lists of download dicts.

    Returns:
        A shallow copy of ``dataset`` whose three download lists are
        replaced by their preprocessed versions (see ``preprocess_download``)
        plus four counters: 'n_downloads_available', 'n_downloads_kg',
        'n_downloads_maybekg' and 'n_kg_available'.
    """
    full_downloads = [preprocess_download(d) for d in dataset.get('full_download', [])]
    other_downloads = [preprocess_download(d) for d in dataset.get('other_download', [])]
    sparql = [preprocess_download(d) for d in dataset.get('sparql', [])]

    n_downloads_available = n_downloads_kg = n_downloads_maybekg = n_kg_available = 0
    # chain() consumes the lists directly; the iter() wrappers were redundant.
    for d in it.chain(full_downloads, other_downloads, sparql):
        # bool() guards against a non-bool 'available' value (e.g. None when
        # the download has no URL) leaking into integer addition.
        n_downloads_available += bool(d['available'])
        if d['detect_kg'] == 'YES':
            n_downloads_kg += 1
        if d['detect_kg'] != 'NO':  # 'YES' or 'MAYBE'
            n_downloads_maybekg += 1
            if d['available']:
                n_kg_available += 1

    # Return a new dict rather than mutating the caller's record in place
    # (the original also wrote n_downloads_available into `dataset` directly,
    # which was redundant with the returned copy).
    return {
        **dataset,
        'full_download': full_downloads,
        'other_download': other_downloads,
        'sparql': sparql,
        'n_downloads_available': n_downloads_available,
        'n_downloads_kg': n_downloads_kg,
        'n_downloads_maybekg': n_downloads_maybekg,
        'n_kg_available': n_kg_available,
    }

preprocess_download(download)

Source code in datasets/views/lodc.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def preprocess_download(download: dict):
    """Normalize one download/access entry and annotate it with availability
    and a knowledge-graph detection verdict.

    Args:
        download: Raw download record; may carry 'title', 'status',
            'download_url', 'access_url' and 'media_type' keys.

    Returns:
        A shallow copy of ``download`` with three extra keys:
        'url' — download URL, falling back to the access URL, else None;
        'available' — bool, True when a URL exists and status is absent
        or exactly "OK";
        'detect_kg' — 'YES', 'MAYBE' or 'NO'.
    """
    title = download.get('title', '')
    status = download.get('status', None)
    url = download.get('download_url', None) or download.get('access_url', None)
    # bool(...) fixes the original truthy-chain, which yielded None/'' instead
    # of False when no URL was present; callers sum this flag and
    # `0 + None` raises TypeError.
    available = bool(url and (status is None or status == "OK"))
    media_type = download.get('media_type', '')

    # Single lowercase haystack for all keyword matching.
    corpus = ' '.join([media_type, title, url or '']).lower()

    # HTML pages and sitemaps are clearly not KG dumps themselves.
    maybe_kg = (
        'html' not in media_type.lower()
        and 'sitemap' not in media_type.lower()
    )
    # NOTE(review): with `or`, any URL whose metadata merely lacks 'void'
    # is classified YES even without a recognized KG format term — confirm
    # this is not meant to be `and`. Behavior intentionally kept as-is.
    is_kg = bool(url) and (
        'void' not in corpus
        or any(term in corpus for term in KG_TERMS)
    )

    return {
        **download,
        'url': url,
        'available': available,
        'detect_kg': 'YES' if is_kg else ('MAYBE' if maybe_kg else 'NO'),
    }

proxy_lodc_api(request)

Source code in datasets/views/lodc.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
@api_view(['GET'])
def proxy_lodc_api(request: Request):
    """Proxy the LOD Cloud dataset listing.

    Fetches https://lod-cloud.net/lod-data.json, preprocesses each dataset
    record, and serves the result from cache for 24 hours.
    """
    datasets = cache.get('lodc_datasets')

    if datasets is None:
        # SECURITY NOTE(review): verify=False disables TLS certificate
        # validation for the upstream request — confirm this is intentional
        # (e.g. a known certificate issue on lod-cloud.net).
        response = requests.get('https://lod-cloud.net/lod-data.json', stream=True, verify=False)
        datasets = response.json()
        for k, v in datasets.items():
            try:
                datasets[k] = preprocess_dataset(v)
            except Exception:
                # Best-effort: keep the raw, unprocessed record if
                # preprocessing fails for this dataset.
                pass

        # Cache the (partially) preprocessed mapping for 24 hours.
        cache.set('lodc_datasets', datasets, timeout=60 * 60 * 24)

    return JsonResponse(datasets)