{ "cells": [ { "cell_type": "markdown", "id": "079a070f", "metadata": {}, "source": [ "This dataset is downloaded from [Mapping China's Tech Giants](https://chinatechmap.aspi.org.au/#/homepage)." ] },
{ "cell_type": "code", "execution_count": 1, "id": "cb2d71b6", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "pd.set_option('display.max_columns', None)" ] },
{ "cell_type": "code", "execution_count": 27, "id": "367e5a7d", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"data.csv\")" ] },
{ "cell_type": "code", "execution_count": 28, "id": "a7eaca36", "metadata": {}, "outputs": [], "source": [ "df_usa = df[df.country_id.str.contains(\"United States\", na=False)]" ] },
{ "cell_type": "code", "execution_count": 29, "id": "505acb82", "metadata": {}, "outputs": [], "source": [ "# .copy() gives df_filter its own data, so the column assignments in the next\n", "# cell no longer raise SettingWithCopyWarning.\n", "df_filter = df_usa[['company_id','type_of_company_id_ref','label','infrastructure_type_id','Primary topic']].copy()" ] },
{ "cell_type": "code", "execution_count": 30, "id": "606c5b48", "metadata": {}, "outputs": [], "source": [ "split_cols = ['company_id', 'type_of_company_id_ref', 'infrastructure_type_id', 'Primary topic']\n", "# Keep only the second '|'-separated piece of each value. Reading from df_usa\n", "# (rather than df_filter itself) lets this cell be re-run without re-splitting\n", "# already-split text. The piece keeps its leading space because later cells\n", "# match on values such as ' Huawei'.\n", "for col in split_cols:\n", "    df_filter[col] = df_usa[col].str.split(pat=\"|\", expand=True)[1]" ] },
{ "cell_type": "code", "execution_count": 32, "id": "38e8d223", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " Artificial Intelligence 97\n", " Biotech 52\n", " Covid-19 34\n", " Surveillance 22\n", " Cloud 17\n", " ePayments 13\n", " 5G 8\n", " Smart cities 2\n", " US Entity List 1\n", "Name: Primary topic, dtype: int64" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filter['Primary topic'].value_counts()" ] }, { "cell_type": "code", "execution_count": 34, "id": "817b99ee", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
company_idtype_of_company_id_reflabelinfrastructure_type_idPrimary topic
10HuaweiTelecommunicationsKeysightR&D lab5G
93HuaweiTelecommunicationsUC BerkeleyResearch partnershipNaN
94HuaweiTelecommunicationsUCLAResearch partnershipNaN
95HuaweiTelecommunicationsUniversity of Illinois at Urbana-ChampaignResearch partnershipNaN
96HuaweiTelecommunicationsMassachusetts Institute of TechnologyResearch partnershipNaN
\n", "
" ], "text/plain": [ " company_id type_of_company_id_ref \\\n", "10 Huawei Telecommunications \n", "93 Huawei Telecommunications \n", "94 Huawei Telecommunications \n", "95 Huawei Telecommunications \n", "96 Huawei Telecommunications \n", "\n", " label infrastructure_type_id \\\n", "10 Keysight R&D lab \n", "93 UC Berkeley Research partnership \n", "94 UCLA Research partnership \n", "95 University of Illinois at Urbana-Champaign Research partnership \n", "96 Massachusetts Institute of Technology Research partnership \n", "\n", " Primary topic \n", "10 5G \n", "93 NaN \n", "94 NaN \n", "95 NaN \n", "96 NaN " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Peek at the first five rows of df_filter.\n", "df_filter.iloc[:5]" ] }, { "cell_type": "code", "execution_count": 35, "id": "45716dae", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
company_idtype_of_company_id_reflabelinfrastructure_type_idPrimary topic
2665AlibabaNaNBravado partnershipCommercial partnershipNaN
2689AlibabaNaNAryaka partnershipCommercial partnershipCloud
2702AlibabaNaNBeyond Meat partnershipCommercial partnershipNaN
2717AlibabaNaNEquinix dealCommercial partnershipCloud
2723AlibabaNaNAscenda partnershipCommercial partnershipNaN
..................
2611iFlytekNaNU.S. Department of Commerce export license to ...HealthCovid-19
4042iFlytekNaNChartwell Strategy GroupPR or lobbyingNaN
2007iFlytekArtificial IntelligenceRutgers UniversityR&D labArtificial Intelligence
1998iFlytekArtificial IntelligenceGeorgia Institute of TechnologyResearch partnershipArtificial Intelligence
2009iFlytekArtificial IntelligenceMassachusetts Institute of TechnologyResearch partnershipArtificial Intelligence
\n", "

594 rows × 5 columns

\n", "
" ], "text/plain": [ " company_id type_of_company_id_ref \\\n", "2665 Alibaba NaN \n", "2689 Alibaba NaN \n", "2702 Alibaba NaN \n", "2717 Alibaba NaN \n", "2723 Alibaba NaN \n", "... ... ... \n", "2611 iFlytek NaN \n", "4042 iFlytek NaN \n", "2007 iFlytek Artificial Intelligence \n", "1998 iFlytek Artificial Intelligence \n", "2009 iFlytek Artificial Intelligence \n", "\n", " label \\\n", "2665 Bravado partnership \n", "2689 Aryaka partnership \n", "2702 Beyond Meat partnership \n", "2717 Equinix deal \n", "2723 Ascenda partnership \n", "... ... \n", "2611 U.S. Department of Commerce export license to ... \n", "4042 Chartwell Strategy Group \n", "2007 Rutgers University \n", "1998 Georgia Institute of Technology \n", "2009 Massachusetts Institute of Technology \n", "\n", " infrastructure_type_id Primary topic \n", "2665 Commercial partnership NaN \n", "2689 Commercial partnership Cloud \n", "2702 Commercial partnership NaN \n", "2717 Commercial partnership Cloud \n", "2723 Commercial partnership NaN \n", "... ... ... 
\n", "2611 Health Covid-19 \n", "4042 PR or lobbying NaN \n", "2007 R&D lab Artificial Intelligence \n", "1998 Research partnership Artificial Intelligence \n", "2009 Research partnership Artificial Intelligence \n", "\n", "[594 rows x 5 columns]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filter.sort_values(by=['company_id','infrastructure_type_id'], ascending=True)" ] }, { "cell_type": "code", "execution_count": 37, "id": "831989ec", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " Huawei 67\n", " DJI 58\n", " Baidu 52\n", " WuXi AppTec Group 47\n", " China Telecom 45\n", "Name: company_id, dtype: int64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_filter.company_id.value_counts().head()" ] }, { "cell_type": "code", "execution_count": 46, "id": "642a8180", "metadata": {}, "outputs": [], "source": [ "# Take the five most frequent companies straight from the data instead of\n", "# retyping the previous cell's output, so the list cannot drift from the data\n", "# (the values carry a leading space, which a hand-typed list can easily miss).\n", "top_five = df_filter.company_id.value_counts().head().index.tolist()\n", "df_five = df_filter[df_filter.company_id.isin(top_five)].sort_values(by=['company_id','infrastructure_type_id'], ascending=True)\n", "# Writes the top-five subset next to the notebook.\n", "df_five.to_csv(\"data_filter_five.csv\",index=False)" ] }, { "cell_type": "code", "execution_count": 65, "id": "6db8defb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Baidu Lab 2\n", "Intel 1\n", "Coral Genomics 1\n", "Snark 1\n", "Sherlock 1\n", "Zensors 1\n", "Veo Robotics, Inc. 1\n", "Simple Machines, Inc. 1\n", "Kebotix 1\n", "DeepBiome Therapeutics 1\n", "Near Protocol 1\n", "Correlia Biosystems 1\n", "Cornami 1\n", "TrustGo acquisition 1\n", "Avail 1\n", "BlackRock Inc. 1\n", "Renaissance Technologies Corp 1\n", "Integrity Partners Inc. 1\n", "Peninsula Capital 1\n", "Sidley Austin LLP 1\n", "Comet Labs 1\n", "xPerception 1\n", "Baidu Venture office 1\n", "US subsidiary 1\n", "AutomationHero 1\n", "Jeeva Wireless 1\n", "Ford 1\n", "Atomwise 1\n", "Qualcomm Technologies, Inc. 
1\n", "Effie Awards 1\n", "Ford partnership 1\n", "Nvidia partnership 1\n", "Intel partnership 1\n", "Microsoft partnership 1\n", "Sectigo partnership 1\n", "Snap Inc partnership 1\n", "Tesla partnership 1\n", "Amazon collaboration 1\n", "Engine Biosciences 1\n", "Lunewave 1\n", "Baidu Ventures 1\n", "Velodyne Lidar 1\n", "Uber 1\n", "Lightelligence Inc 1\n", "Strateos 1\n", "Quantapore 1\n", "Probius Dx 1\n", "Rain Neuromorphics 1\n", "CubeWorks 1\n", "RootPath Genomics 1\n", "Partnership on AI 1\n", "Name: label, dtype: int64" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# How often each label occurs among the ' Baidu' rows of df_five.\n", "df_five.loc[df_five.company_id == ' Baidu', 'label'].value_counts()" ] }, { "cell_type": "code", "execution_count": 57, "id": "b18d0682", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
company_idtype_of_company_id_reflabelinfrastructure_type_idPrimary topic
97HuaweiTelecommunicationsPrinceton UniversityDonationNaN
461HuaweiTelecommunicationsPrinceton UniversityResearch partnershipNaN
\n", "
" ], "text/plain": [ " company_id type_of_company_id_ref label \\\n", "97 Huawei Telecommunications Princeton University \n", "461 Huawei Telecommunications Princeton University \n", "\n", " infrastructure_type_id Primary topic \n", "97 Donation NaN \n", "461 Research partnership NaN " ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows of df_five where the company is ' Huawei' and the label is Princeton University.\n", "df_five.loc[(df_five.company_id == ' Huawei') & (df_five.label == 'Princeton University')]" ] }, { "cell_type": "code", "execution_count": 62, "id": "f557a8a5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " Health 12\n", " Investment or joint venture 12\n", " Subsidiary 9\n", " Overseas Office 7\n", " Investor 2\n", " Manufacturing facility 2\n", " R&D lab 2\n", " Research partnership 1\n", "Name: infrastructure_type_id, dtype: int64" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Infrastructure-type counts for the ' WuXi AppTec Group' rows of df_five.\n", "df_five.loc[df_five.company_id == ' WuXi AppTec Group', 'infrastructure_type_id'].value_counts()" ] }, { "cell_type": "code", "execution_count": 66, "id": "76521b29", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " Commercial partnership 158\n", " Investment or joint venture 86\n", " Overseas Office 57\n", " Research partnership 53\n", " Data centre 49\n", " PR or lobbying 37\n", " Health 33\n", " R&D lab 26\n", " Subsidiary 25\n", " Investor 16\n", " Surveillance equipment 14\n", " Donation 13\n", " Telecommunications or ICT 8\n", " MoU-agreement 4\n", " Manufacturing facility 2\n", " Security inspection equipment 2\n", " Facial recognition 1\n", " Smart City-Public Security project 1\n", " Training 1\n", "Name: infrastructure_type_id, dtype: int64" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Infrastructure-type counts across every row of df_filter.\n", "df_filter['infrastructure_type_id'].value_counts()" ] }, { "cell_type": "code", "execution_count": 68, "id": "4c6e429c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(269, 5)" ] }, "execution_count": 68, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "# (rows, columns) of the top-five subset.\n", "df_five.shape" ] }, { "cell_type": "code", "execution_count": 33, "id": "05603c29", "metadata": {}, "outputs": [], "source": [ "# Write df_filter to CSV without the index column.\n", "df_filter.to_csv(path_or_buf=\"data_filter.csv\", index=False)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }