{
"cells": [
{
"cell_type": "markdown",
"id": "079a070f",
"metadata": {},
"source": [
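"This dataset was downloaded from [Mapping China's Tech Giants](https://chinatechmap.aspi.org.au/#/homepage). The notebook below filters it to entries whose country is the United States and exports the results as CSV files."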
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "cb2d71b6",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Show every column when a DataFrame is displayed (no column truncation).\n",
"pd.options.display.max_columns = None"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "367e5a7d",
"metadata": {},
"outputs": [],
"source": [
"# Load the raw export; the relative path is resolved against the working directory.\n",
"df = pd.read_csv(filepath_or_buffer=\"data.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "a7eaca36",
"metadata": {},
"outputs": [],
"source": [
"# Keep rows whose country_id contains \"United States\"; na=False excludes rows\n",
"# where country_id is missing.\n",
"df_usa = df.loc[\n",
"    df[\"country_id\"].str.contains(\"United States\", na=False)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "505acb82",
"metadata": {},
"outputs": [],
"source": [
"# Columns kept for the analysis.\n",
"# .copy() makes df_filter an independent frame, so later column assignments do not\n",
"# raise SettingWithCopyWarning (df_usa is itself a filtered slice of df).\n",
"df_filter = df_usa[['company_id','type_of_company_id_ref','label','infrastructure_type_id','Primary topic']].copy()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "606c5b48",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/angeline_jcq/.pyenv/versions/3.9.7/lib/python3.9/site-packages/pandas/core/generic.py:5516: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" self[name] = value\n",
"/var/folders/9m/w7ffkstj051dnyjnyklwvwmr0000gn/T/ipykernel_82579/1061540503.py:4: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_filter['Primary topic'] = df_filter['Primary topic'].str.split(pat=\"|\",expand=True)[1]\n"
]
}
],
"source": [
"# Replace each listed column with the second \"|\"-separated piece of its values.\n",
"# assign() returns a new frame instead of writing into a slice, which avoids the\n",
"# SettingWithCopyWarning raised by in-place column assignment.\n",
"# NOTE: the extracted values keep their leading space (e.g. \" Huawei\").\n",
"pipe_columns = ['company_id', 'type_of_company_id_ref', 'infrastructure_type_id', 'Primary topic']\n",
"df_filter = df_filter.assign(**{\n",
"    column: df_filter[column].str.split(pat=\"|\", expand=True)[1]\n",
"    for column in pipe_columns\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "38e8d223",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" Artificial Intelligence 97\n",
" Biotech 52\n",
" Covid-19 34\n",
" Surveillance 22\n",
" Cloud 17\n",
" ePayments 13\n",
" 5G 8\n",
" Smart cities 2\n",
" US Entity List 1\n",
"Name: Primary topic, dtype: int64"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of entries per primary topic; rows with a missing topic are not counted.\n",
"df_filter['Primary topic'].value_counts(dropna=True)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "817b99ee",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" company_id | \n",
" type_of_company_id_ref | \n",
" label | \n",
" infrastructure_type_id | \n",
" Primary topic | \n",
"
\n",
" \n",
" \n",
" \n",
" 10 | \n",
" Huawei | \n",
" Telecommunications | \n",
" Keysight | \n",
" R&D lab | \n",
" 5G | \n",
"
\n",
" \n",
" 93 | \n",
" Huawei | \n",
" Telecommunications | \n",
" UC Berkeley | \n",
" Research partnership | \n",
" NaN | \n",
"
\n",
" \n",
" 94 | \n",
" Huawei | \n",
" Telecommunications | \n",
" UCLA | \n",
" Research partnership | \n",
" NaN | \n",
"
\n",
" \n",
" 95 | \n",
" Huawei | \n",
" Telecommunications | \n",
" University of Illinois at Urbana-Champaign | \n",
" Research partnership | \n",
" NaN | \n",
"
\n",
" \n",
" 96 | \n",
" Huawei | \n",
" Telecommunications | \n",
" Massachusetts Institute of Technology | \n",
" Research partnership | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" company_id type_of_company_id_ref \\\n",
"10 Huawei Telecommunications \n",
"93 Huawei Telecommunications \n",
"94 Huawei Telecommunications \n",
"95 Huawei Telecommunications \n",
"96 Huawei Telecommunications \n",
"\n",
" label infrastructure_type_id \\\n",
"10 Keysight R&D lab \n",
"93 UC Berkeley Research partnership \n",
"94 UCLA Research partnership \n",
"95 University of Illinois at Urbana-Champaign Research partnership \n",
"96 Massachusetts Institute of Technology Research partnership \n",
"\n",
" Primary topic \n",
"10 5G \n",
"93 NaN \n",
"94 NaN \n",
"95 NaN \n",
"96 NaN "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the cleaned frame.\n",
"df_filter.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "45716dae",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" company_id | \n",
" type_of_company_id_ref | \n",
" label | \n",
" infrastructure_type_id | \n",
" Primary topic | \n",
"
\n",
" \n",
" \n",
" \n",
" 2665 | \n",
" Alibaba | \n",
" NaN | \n",
" Bravado partnership | \n",
" Commercial partnership | \n",
" NaN | \n",
"
\n",
" \n",
" 2689 | \n",
" Alibaba | \n",
" NaN | \n",
" Aryaka partnership | \n",
" Commercial partnership | \n",
" Cloud | \n",
"
\n",
" \n",
" 2702 | \n",
" Alibaba | \n",
" NaN | \n",
" Beyond Meat partnership | \n",
" Commercial partnership | \n",
" NaN | \n",
"
\n",
" \n",
" 2717 | \n",
" Alibaba | \n",
" NaN | \n",
" Equinix deal | \n",
" Commercial partnership | \n",
" Cloud | \n",
"
\n",
" \n",
" 2723 | \n",
" Alibaba | \n",
" NaN | \n",
" Ascenda partnership | \n",
" Commercial partnership | \n",
" NaN | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 2611 | \n",
" iFlytek | \n",
" NaN | \n",
" U.S. Department of Commerce export license to ... | \n",
" Health | \n",
" Covid-19 | \n",
"
\n",
" \n",
" 4042 | \n",
" iFlytek | \n",
" NaN | \n",
" Chartwell Strategy Group | \n",
" PR or lobbying | \n",
" NaN | \n",
"
\n",
" \n",
" 2007 | \n",
" iFlytek | \n",
" Artificial Intelligence | \n",
" Rutgers University | \n",
" R&D lab | \n",
" Artificial Intelligence | \n",
"
\n",
" \n",
" 1998 | \n",
" iFlytek | \n",
" Artificial Intelligence | \n",
" Georgia Institute of Technology | \n",
" Research partnership | \n",
" Artificial Intelligence | \n",
"
\n",
" \n",
" 2009 | \n",
" iFlytek | \n",
" Artificial Intelligence | \n",
" Massachusetts Institute of Technology | \n",
" Research partnership | \n",
" Artificial Intelligence | \n",
"
\n",
" \n",
"
\n",
"
594 rows × 5 columns
\n",
"
"
],
"text/plain": [
" company_id type_of_company_id_ref \\\n",
"2665 Alibaba NaN \n",
"2689 Alibaba NaN \n",
"2702 Alibaba NaN \n",
"2717 Alibaba NaN \n",
"2723 Alibaba NaN \n",
"... ... ... \n",
"2611 iFlytek NaN \n",
"4042 iFlytek NaN \n",
"2007 iFlytek Artificial Intelligence \n",
"1998 iFlytek Artificial Intelligence \n",
"2009 iFlytek Artificial Intelligence \n",
"\n",
" label \\\n",
"2665 Bravado partnership \n",
"2689 Aryaka partnership \n",
"2702 Beyond Meat partnership \n",
"2717 Equinix deal \n",
"2723 Ascenda partnership \n",
"... ... \n",
"2611 U.S. Department of Commerce export license to ... \n",
"4042 Chartwell Strategy Group \n",
"2007 Rutgers University \n",
"1998 Georgia Institute of Technology \n",
"2009 Massachusetts Institute of Technology \n",
"\n",
" infrastructure_type_id Primary topic \n",
"2665 Commercial partnership NaN \n",
"2689 Commercial partnership Cloud \n",
"2702 Commercial partnership NaN \n",
"2717 Commercial partnership Cloud \n",
"2723 Commercial partnership NaN \n",
"... ... ... \n",
"2611 Health Covid-19 \n",
"4042 PR or lobbying NaN \n",
"2007 R&D lab Artificial Intelligence \n",
"1998 Research partnership Artificial Intelligence \n",
"2009 Research partnership Artificial Intelligence \n",
"\n",
"[594 rows x 5 columns]"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the entries ordered by company, then by infrastructure type (ascending).\n",
"# The sorted result is only displayed; df_filter itself is left unchanged.\n",
"df_filter.sort_values(\n",
"    by=['company_id', 'infrastructure_type_id'],\n",
"    ascending=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "831989ec",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" Huawei 67\n",
" DJI 58\n",
" Baidu 52\n",
" WuXi AppTec Group 47\n",
" China Telecom 45\n",
"Name: company_id, dtype: int64"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The five companies with the most entries.\n",
"df_filter['company_id'].value_counts().head(5)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "642a8180",
"metadata": {},
"outputs": [],
"source": [
"# Derive the five most frequent companies from the data rather than hard-coding\n",
"# their names, so the selection does not depend on exact string formatting\n",
"# (the company_id values carry a leading space).\n",
"top_five = df_filter['company_id'].value_counts().head(5).index.tolist()\n",
"df_five = (\n",
"    df_filter[df_filter['company_id'].isin(top_five)]\n",
"    .sort_values(by=['company_id', 'infrastructure_type_id'], ascending=True)\n",
")\n",
"# Export the top-five subset without the index column.\n",
"df_five.to_csv(\"data_filter_five.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "6db8defb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Baidu Lab 2\n",
"Intel 1\n",
"Coral Genomics 1\n",
"Snark 1\n",
"Sherlock 1\n",
"Zensors 1\n",
"Veo Robotics, Inc. 1\n",
"Simple Machines, Inc. 1\n",
"Kebotix 1\n",
"DeepBiome Therapeutics 1\n",
"Near Protocol 1\n",
"Correlia Biosystems 1\n",
"Cornami 1\n",
"TrustGo acquisition 1\n",
"Avail 1\n",
"BlackRock Inc. 1\n",
"Renaissance Technologies Corp 1\n",
"Integrity Partners Inc. 1\n",
"Peninsula Capital 1\n",
"Sidley Austin LLP 1\n",
"Comet Labs 1\n",
"xPerception 1\n",
"Baidu Venture office 1\n",
"US subsidiary 1\n",
"AutomationHero 1\n",
"Jeeva Wireless 1\n",
"Ford 1\n",
"Atomwise 1\n",
"Qualcomm Technologies, Inc. 1\n",
"Effie Awards 1\n",
"Ford partnership 1\n",
"Nvidia partnership 1\n",
"Intel partnership 1\n",
"Microsoft partnership 1\n",
"Sectigo partnership 1\n",
"Snap Inc partnership 1\n",
"Tesla partnership 1\n",
"Amazon collaboration 1\n",
"Engine Biosciences 1\n",
"Lunewave 1\n",
"Baidu Ventures 1\n",
"Velodyne Lidar 1\n",
"Uber 1\n",
"Lightelligence Inc 1\n",
"Strateos 1\n",
"Quantapore 1\n",
"Probius Dx 1\n",
"Rain Neuromorphics 1\n",
"CubeWorks 1\n",
"RootPath Genomics 1\n",
"Partnership on AI 1\n",
"Name: label, dtype: int64"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# How often each label appears among Baidu's entries\n",
"# (company_id values keep their leading space).\n",
"df_five.loc[df_five['company_id'] == ' Baidu', 'label'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "b18d0682",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" company_id | \n",
" type_of_company_id_ref | \n",
" label | \n",
" infrastructure_type_id | \n",
" Primary topic | \n",
"
\n",
" \n",
" \n",
" \n",
" 97 | \n",
" Huawei | \n",
" Telecommunications | \n",
" Princeton University | \n",
" Donation | \n",
" NaN | \n",
"
\n",
" \n",
" 461 | \n",
" Huawei | \n",
" Telecommunications | \n",
" Princeton University | \n",
" Research partnership | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" company_id type_of_company_id_ref label \\\n",
"97 Huawei Telecommunications Princeton University \n",
"461 Huawei Telecommunications Princeton University \n",
"\n",
" infrastructure_type_id Primary topic \n",
"97 Donation NaN \n",
"461 Research partnership NaN "
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Huawei entries whose label is 'Princeton University'.\n",
"df_five.loc[\n",
"    df_five['company_id'].eq(' Huawei')\n",
"    & df_five['label'].eq('Princeton University')\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "f557a8a5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" Health 12\n",
" Investment or joint venture 12\n",
" Subsidiary 9\n",
" Overseas Office 7\n",
" Investor 2\n",
" Manufacturing facility 2\n",
" R&D lab 2\n",
" Research partnership 1\n",
"Name: infrastructure_type_id, dtype: int64"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Breakdown of WuXi AppTec Group's entries by infrastructure type.\n",
"df_five.loc[\n",
"    df_five['company_id'] == ' WuXi AppTec Group',\n",
"    'infrastructure_type_id',\n",
"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "76521b29",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
" Commercial partnership 158\n",
" Investment or joint venture 86\n",
" Overseas Office 57\n",
" Research partnership 53\n",
" Data centre 49\n",
" PR or lobbying 37\n",
" Health 33\n",
" R&D lab 26\n",
" Subsidiary 25\n",
" Investor 16\n",
" Surveillance equipment 14\n",
" Donation 13\n",
" Telecommunications or ICT 8\n",
" MoU-agreement 4\n",
" Manufacturing facility 2\n",
" Security inspection equipment 2\n",
" Facial recognition 1\n",
" Smart City-Public Security project 1\n",
" Training 1\n",
"Name: infrastructure_type_id, dtype: int64"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Infrastructure-type counts across every row of df_filter.\n",
"df_filter['infrastructure_type_id'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "4c6e429c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(269, 5)"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (rows, columns) of the top-five subset.\n",
"df_five.shape"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "05603c29",
"metadata": {},
"outputs": [],
"source": [
"# Export the cleaned subset without the index column.\n",
"df_filter.to_csv(path_or_buf=\"data_filter.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}