{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Using GRID and Dimensions together to identify Collaboration within a region" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from dslquery import dslquery\n", "import zipfile as zf\n", "import io\n", "import requests\n", "import geopy.distance" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## First download the GRID dataset, and extract the addresses.csv file\n", "you can get the GRID dataset from here: https://grid.ac/downloads\n", "Once unzipped, put the types.csv file in your notebook directory..." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## you can also download the latest release directly from Figshare...\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
defined_typedoigroup_ididpublished_datethumbtitleurlurl_private_apiurl_private_htmlurl_public_apiurl_public_html
0410.6084/m9.figshare.6683654.v1607266836542018-06-27T09:50:05ZGRID release 2018-06-25https://api.figshare.com/v2/articles/6683654https://api.figshare.com/v2/account/articles/6...https://figshare.com/account/articles/6683654https://api.figshare.com/v2/articles/6683654https://figshare.com/articles/GRID_release_201...
1410.6084/m9.figshare.6216392.v1607262163922018-05-03T13:02:09ZGRID release 2018-05-01https://api.figshare.com/v2/articles/6216392https://api.figshare.com/v2/account/articles/6...https://figshare.com/account/articles/6216392https://api.figshare.com/v2/articles/6216392https://figshare.com/articles/GRID_release_201...
\n", "
" ], "text/plain": [ " defined_type doi group_id id \\\n", "0 4 10.6084/m9.figshare.6683654.v1 6072 6683654 \n", "1 4 10.6084/m9.figshare.6216392.v1 6072 6216392 \n", "\n", " published_date thumb title \\\n", "0 2018-06-27T09:50:05Z GRID release 2018-06-25 \n", "1 2018-05-03T13:02:09Z GRID release 2018-05-01 \n", "\n", " url \\\n", "0 https://api.figshare.com/v2/articles/6683654 \n", "1 https://api.figshare.com/v2/articles/6216392 \n", "\n", " url_private_api \\\n", "0 https://api.figshare.com/v2/account/articles/6... \n", "1 https://api.figshare.com/v2/account/articles/6... \n", "\n", " url_private_html \\\n", "0 https://figshare.com/account/articles/6683654 \n", "1 https://figshare.com/account/articles/6216392 \n", "\n", " url_public_api \\\n", "0 https://api.figshare.com/v2/articles/6683654 \n", "1 https://api.figshare.com/v2/articles/6216392 \n", "\n", " url_public_html \n", "0 https://figshare.com/articles/GRID_release_201... \n", "1 https://figshare.com/articles/GRID_release_201... " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_versions = pd.DataFrame(\n", " requests.get(\"https://api.figshare.com/v2/collections/3812929/articles\").json()\n", ").sort_values('published_date', ascending=False)\n", "\n", "grid_versions.head(2)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "grid_download_url = requests.get(grid_versions.loc[1]['url_public_api']).json()['files'][0]['download_url']\n", "\n", "with zf.ZipFile(\n", " io.BytesIO(requests.get(grid_download_url).content)\n", " ) as thezip:\n", " \n", " grid_addresses = pd.read_csv(thezip.open('full_tables/addresses.csv'),low_memory=False)\n", " grid_institutes = pd.read_csv(thezip.open('full_tables/institutes.csv'),low_memory=False)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "grid_details = grid_institutes.merge(grid_addresses, left_on='grid_id', right_on='grid_id', how='outer')" ] }, { "cell_type": "code", "execution_count": 129, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
grid_idnamewikipedia_urlemail_addressestablishedline_1line_2line_3latlngpostcodeprimarycitystatestate_codecountrycountry_codegeonames_city_iddist_from_melbourne
0grid.1001.0Australian National Universityhttp://en.wikipedia.org/wiki/Australian_Nation...NaN1946.0NaNNaNNaN-35.277800149.120500NaNFalseCanberraAustralian Capital TerritoryAU-ACTAustraliaAU2172517.0465.706096
1grid.1002.3Monash Universityhttp://en.wikipedia.org/wiki/Monash_UniversityNaN1958.0NaNNaNNaN-37.908300145.138000NaNFalseMelbourneVictoriaAU-VICAustraliaAU2158177.019.944366
2grid.1003.2University of Queenslandhttp://en.wikipedia.org/wiki/University_of_Que...NaN1909.0NaNNaNNaN-27.495964153.009627NaNFalseBrisbaneQueenslandAU-QLDAustraliaAU2174003.01368.120576
3grid.1004.5Macquarie Universityhttp://en.wikipedia.org/wiki/Macquarie_UniversityNaN1964.0NaNNaNNaN-33.775259151.112915NaNFalseSydneyNew South WalesAU-NSWAustraliaAU2147714.0712.839058
4grid.1005.4UNSW Australiahttp://en.wikipedia.org/wiki/University_of_New...NaN1949.0NaNNaNNaN-33.917731151.230964NaNFalseSydneyNew South WalesAU-NSWAustraliaAU2147714.0711.126541
\n", "
" ], "text/plain": [ " grid_id name \\\n", "0 grid.1001.0 Australian National University \n", "1 grid.1002.3 Monash University \n", "2 grid.1003.2 University of Queensland \n", "3 grid.1004.5 Macquarie University \n", "4 grid.1005.4 UNSW Australia \n", "\n", " wikipedia_url email_address \\\n", "0 http://en.wikipedia.org/wiki/Australian_Nation... NaN \n", "1 http://en.wikipedia.org/wiki/Monash_University NaN \n", "2 http://en.wikipedia.org/wiki/University_of_Que... NaN \n", "3 http://en.wikipedia.org/wiki/Macquarie_University NaN \n", "4 http://en.wikipedia.org/wiki/University_of_New... NaN \n", "\n", " established line_1 line_2 line_3 lat lng postcode primary \\\n", "0 1946.0 NaN NaN NaN -35.277800 149.120500 NaN False \n", "1 1958.0 NaN NaN NaN -37.908300 145.138000 NaN False \n", "2 1909.0 NaN NaN NaN -27.495964 153.009627 NaN False \n", "3 1964.0 NaN NaN NaN -33.775259 151.112915 NaN False \n", "4 1949.0 NaN NaN NaN -33.917731 151.230964 NaN False \n", "\n", " city state state_code country country_code \\\n", "0 Canberra Australian Capital Territory AU-ACT Australia AU \n", "1 Melbourne Victoria AU-VIC Australia AU \n", "2 Brisbane Queensland AU-QLD Australia AU \n", "3 Sydney New South Wales AU-NSW Australia AU \n", "4 Sydney New South Wales AU-NSW Australia AU \n", "\n", " geonames_city_id dist_from_melbourne \n", "0 2172517.0 465.706096 \n", "1 2158177.0 19.944366 \n", "2 2174003.0 1368.120576 \n", "3 2147714.0 712.839058 \n", "4 2147714.0 711.126541 " ] }, "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grid_details.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Calculate distance from a point (In this case, the University of Melbourne)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "institution = 'grid.1008.9'" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "def gridcoords(df,grid_id):\n", " rw = df[df['grid_id'] == grid_id]\n", " return (rw.iloc[0]['lat'],rw.iloc[0]['lng'])\n", "\n", "def getdistance(coord1,coord2):\n", " try:\n", " return geopy.distance.distance(coord1,coord2).km\n", " except:\n", " return None\n", "\n", "source_coords = gridcoords(grid_details,institution) \n", "\n", "grid_details['dist_from_melbourne'] = grid_details.apply (lambda row: getdistance(source_coords,\n", " (row['lat'],row['lng'])\n", " ),axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Identify Institutions within a 1 km radius of the University of Melbourne" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [], "source": [ "local_grids = grid_details[grid_details['dist_from_melbourne'] < 1 ]" ] }, { "cell_type": "code", "execution_count": 140, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "29\n" ] } ], "source": [ "grids = list(local_grids.grid_id.unique())\n", "print(len(grids))" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
grid_idnamewikipedia_urlemail_addressestablishedline_1line_2line_3latlngpostcodeprimarycitystatestate_codecountrycountry_codegeonames_city_iddist_from_melbourne
7grid.1008.9University of Melbournehttp://en.wikipedia.org/wiki/University_of_Mel...NaN1853.0NaNNaNNaN-37.797115144.959972NaNFalseMelbourneVictoriaAU-VICAustraliaAU2158177.00.000000
36grid.1042.7Walter and Eliza Hall Institute of Medical Res...https://en.wikipedia.org/wiki/Walter_and_Eliza...NaN1915.0NaNNaNNaN-37.798000144.956000NaNFalseMelbourneVictoriaAU-VICAustraliaAU2158177.00.363357
\n", "
" ], "text/plain": [ " grid_id name \\\n", "7 grid.1008.9 University of Melbourne \n", "36 grid.1042.7 Walter and Eliza Hall Institute of Medical Res... \n", "\n", " wikipedia_url email_address \\\n", "7 http://en.wikipedia.org/wiki/University_of_Mel... NaN \n", "36 https://en.wikipedia.org/wiki/Walter_and_Eliza... NaN \n", "\n", " established line_1 line_2 line_3 lat lng postcode \\\n", "7 1853.0 NaN NaN NaN -37.797115 144.959972 NaN \n", "36 1915.0 NaN NaN NaN -37.798000 144.956000 NaN \n", "\n", " primary city state state_code country country_code \\\n", "7 False Melbourne Victoria AU-VIC Australia AU \n", "36 False Melbourne Victoria AU-VIC Australia AU \n", "\n", " geonames_city_id dist_from_melbourne \n", "7 2158177.0 0.000000 \n", "36 2158177.0 0.363357 " ] }, "execution_count": 156, "metadata": {}, "output_type": "execute_result" } ], "source": [ "local_grids.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Search for all publications produced by these institutions since 2010" ] }, { "cell_type": "code", "execution_count": 143, "metadata": {}, "outputs": [], "source": [ "def publicationsfromgrid(grids,limit=1000,pubskip='qpub'):\n", " searchstring = \"\"\"\n", " search publications\n", " where\n", " research_orgs.id in [{}]\n", " and year >= \"2010\"\n", " and id < \"{}\"\n", " return publications[id+doi+times_cited+year+author_affiliations] sort by id\n", " limit {}\n", " \"\"\".format(\",\".join([ '\"{}\"'.format(g) for g in grids]),pubskip,limit)\n", " #print (searchstring)\n", " return searchstring" ] }, { "cell_type": "code", "execution_count": 144, "metadata": {}, "outputs": [], "source": [ "def dslsearchpublications(grids):\n", " pubskip = 'q'\n", " pubs = []\n", " total_pubs = []\n", " result = {}\n", " while (pubskip == 'q') or (len(pubs) == 1000):\n", " pubs = dslquery(publicationsfromgrid(grids,pubskip=pubskip),0).get('publications',[])\n", " total_pubs += pubs\n", " pubskip = pubs[len(pubs)-1]['id'] \n", " \n", " \n", " return total_pubs" ] }, { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [], "source": [ "def publicationsfromgridlist(grids):\n", " pubs=[]\n", " \n", " idchunks = [grids[x:x + 499] for x in range(0, len(grids), 499)]\n", " for ids in idchunks:\n", " pubs += dslsearchpublications(ids)\n", " print(len(pubs))\n", " \n", " return pubs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "industry_pubs = publicationsfromgridlist(grids)" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "70215" ] }, "execution_count": 147, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(industry_pubs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Citations by year of publication for the region" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 148, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pd.DataFrame(industry_pubs).groupby(['year']).sum().plot(kind='bar')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Publications by year for the region" ] }, { "cell_type": "code", "execution_count": 149, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 149, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "pd.DataFrame(industry_pubs)[['id','year']]. \\\n", " groupby(['year']). \\\n", " count(). \\\n", " plot(kind='bar')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Filter Publications list for publications produced by two or more institutions in the region" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [], "source": [ "industry_aff = [dict( \n", " year = p['year'],\n", " pubid = p['id'],\n", " grid = aff['id'],\n", " inst = aff['name']\n", " \n", " )\n", " for p in industry_pubs\n", " for auth in p.get('author_affiliations', [{}])[0]\n", " for aff in auth.get('affiliations',[])\n", " if aff.get('id','') in grids\n", " \n", "]" ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [], "source": [ "idf = pd.DataFrame(industry_aff).drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
instgridpubid
16University of Melbournegrid.1008.962186
13Royal Children's Hospitalgrid.416107.54356
14Royal Melbourne Hospitalgrid.416153.44193
20Walter and Eliza Hall Institute of Medical Res...grid.1042.72695
7Florey Institute of Neuroscience and Mental He...grid.418025.a1682
15Royal Women's Hospitalgrid.416259.d1243
10Melbourne Healthgrid.429299.d496
17Victorian Infectious Diseases Reference Labora...grid.433799.3479
11Melbourne Sexual Health Centregrid.490309.7251
8IBM Research - Australiagrid.481553.e163
4CO2CRCgrid.450289.2137
12Peter Doherty Institutegrid.483778.7134
5CRC for Spatial informationgrid.484046.a103
18Victorian Life Sciences Computation Initiativegrid.452643.274
2Australian and New Zealand Intensive Care Societygrid.489411.162
3BirdLife Australiagrid.478479.134
0Australia and New Zealand School of Governmentgrid.473712.416
19Victorian Responsible Gambling Foundationgrid.484726.b5
1Australian Mathematical Sciences Institutegrid.467229.c4
6Centre of Excellence for Particle Physics at t...grid.453169.c1
9Intensive Care Foundationgrid.479943.41
\n", "
" ], "text/plain": [ " inst grid pubid\n", "16 University of Melbourne grid.1008.9 62186\n", "13 Royal Children's Hospital grid.416107.5 4356\n", "14 Royal Melbourne Hospital grid.416153.4 4193\n", "20 Walter and Eliza Hall Institute of Medical Res... grid.1042.7 2695\n", "7 Florey Institute of Neuroscience and Mental He... grid.418025.a 1682\n", "15 Royal Women's Hospital grid.416259.d 1243\n", "10 Melbourne Health grid.429299.d 496\n", "17 Victorian Infectious Diseases Reference Labora... grid.433799.3 479\n", "11 Melbourne Sexual Health Centre grid.490309.7 251\n", "8 IBM Research - Australia grid.481553.e 163\n", "4 CO2CRC grid.450289.2 137\n", "12 Peter Doherty Institute grid.483778.7 134\n", "5 CRC for Spatial information grid.484046.a 103\n", "18 Victorian Life Sciences Computation Initiative grid.452643.2 74\n", "2 Australian and New Zealand Intensive Care Society grid.489411.1 62\n", "3 BirdLife Australia grid.478479.1 34\n", "0 Australia and New Zealand School of Government grid.473712.4 16\n", "19 Victorian Responsible Gambling Foundation grid.484726.b 5\n", "1 Australian Mathematical Sciences Institute grid.467229.c 4\n", "6 Centre of Excellence for Particle Physics at t... grid.453169.c 1\n", "9 Intensive Care Foundation grid.479943.4 1" ] }, "execution_count": 152, "metadata": {}, "output_type": "execute_result" } ], "source": [ "idf[['inst','grid','pubid']]. \\\n", " groupby(['inst','grid']). \\\n", " count(). \\\n", " reset_index(). \\\n", " sort_values(by='pubid', ascending=False)" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [], "source": [ "ia = pd.DataFrame(industry_aff)\n", "\n", "def precinct_collab(series):\n", " precinct = [grid\n", " for grid in series.tolist()\n", " if grid in grids ]\n", " if len(set(precinct)) > 1:\n", " return 1\n", " else:\n", " return 0\n", " return \n", "\n", "iadf =ia.groupby(['pubid','year']). \\\n", " agg({'grid': [precinct_collab],\n", " \n", " }).reset_index()" ] }, { "cell_type": "code", "execution_count": 157, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.6/site-packages/pandas/core/generic.py:3108: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.\n", " obj = obj._drop_axis(labels, axis, level=level, errors=errors)\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 157, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "iadf.groupby('year').sum().plot(kind='bar')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }