atlas_of_worldstars
/
2_process_asterism_locations.ipynb
608 строк · 21.5 Кб
1{
2"cells": [
3{
4"cell_type": "code",
5"execution_count": 1,
6"metadata": {},
7"outputs": [],
8"source": [
9"import numpy as np\n",
10"import pandas as pd\n",
11"import glob, os"
12]
13},
14{
15"cell_type": "code",
16"execution_count": 2,
17"metadata": {},
18"outputs": [
19{
20"name": "stdout",
21"output_type": "stream",
22"text": [
23"pandas 0.23.4\n",
24"numpy 1.15.4\n",
25"watermark 1.8.1\n",
26"ELEANOR LUTZ 2019-07-26 \n",
27"\n",
28"CPython 3.7.1\n",
29"IPython 7.2.0\n",
30"\n",
31"compiler : MSC v.1900 64 bit (AMD64)\n",
32"system : Windows\n",
33"release : 10\n",
34"machine : AMD64\n",
35"processor : Intel64 Family 6 Model 63 Stepping 2, GenuineIntel\n",
36"CPU cores : 12\n",
37"interpreter: 64bit\n"
38]
39}
40],
41"source": [
42"# Watermark is not required for this code, but is included for information. \n",
43"import watermark\n",
44"%load_ext watermark\n",
45"%watermark -a \"ELEANOR LUTZ\" -d -v -iv -m"
46]
47},
48{
49"cell_type": "code",
50"execution_count": 3,
51"metadata": {},
52"outputs": [
53{
54"name": "stdout",
55"output_type": "stream",
56"text": [
57"['arabic', 'arabic_moon_stations', 'armintxe', 'aztec', 'belarusian', 'boorong', 'chinese_medieval', 'dakota', 'egyptian', 'hawaiian_starlines', 'indian', 'inuit', 'japanese_moon_stations', 'kamilaroi', 'korean', 'lokono', 'macedonian', 'maori', 'maya', 'mongolian', 'mulapin', 'navajo', 'norse', 'northern_andes', 'ojibwe', 'romanian', 'sami', 'sardinian', 'seleucid', 'siberian', 'tongan', 'tukano', 'tupi', 'western']\n"
58]
59}
60],
61"source": [
# Read in processed star data (created in 1_process_starbase_data.ipynb)
df_stars = pd.read_csv("./data/processed/hygdata_processed.csv", low_memory=False)

# Exclude "culture" folders that are replicated in other folders.
# "chinese_medieval" and "western" are used instead of the alternates in the list below ("exclude")
exclude = ['western_SnT', 'western_hlad', 'western_rey', 'chinese_contemporary', 'chinese']

# glob returns each culture folder with a trailing path separator. normpath strips
# that separator and basename keeps only the folder name, so this works with both
# Windows ("\\") and POSIX ("/") separators instead of splitting on "\\" only.
culture_dirs = glob.glob('./data/skycultures/*/')
fabfiles = sorted(os.path.basename(os.path.normpath(x)) for x in culture_dirs)
fabfiles = [x for x in fabfiles if x not in exclude]

# Print all cultures that will be analyzed in this Jupyter notebook
print(fabfiles)
74]
75},
76{
77"cell_type": "code",
78"execution_count": 4,
79"metadata": {},
80"outputs": [
81{
82"name": "stdout",
83"output_type": "stream",
84"text": [
85"---DONE---\n"
86]
87}
88],
89"source": [
'''
Hacky shortcut to process .fab files so that Pandas can open variable-length
rows as comma-delimited.
Just puts longest row at top so Pandas doesn't encounter unexpected columns.
'''
sorted_dir = './data/processed/skycultures/sorted_constellations/'
# Make sure the output folder exists so to_csv does not fail on a fresh checkout.
os.makedirs(sorted_dir, exist_ok=True)

for name in fabfiles:
    fname = './data/skycultures/'+name+'/constellationship.fab'
    savename = sorted_dir+name+'_sorted.csv'
    df = pd.read_csv(fname, header=None)
    # For some reason newer files are tab delimited with spacer tabs, so their
    # row length is measured in characters; the other files are measured by
    # counting the spaces in each row.
    if name in ['mulapin', 'seleucid']:
        df['len'] = df[0].str.strip().str.len()
    else:
        df['len'] = df[0].str.count(' ')
    # Longest row first; the helper column is dropped again before saving.
    df = df.sort_values(by='len', ascending=False)
    del df['len']
    df.to_csv(savename, index=False, header=None)

print('---DONE---')
112]
113},
114{
115"cell_type": "code",
116"execution_count": 5,
117"metadata": {},
118"outputs": [],
119"source": [
def _hip_key(val):
    '''
    Return a star ID as a plain integer string (e.g. 78727.0 -> '78727'),
    or None if the value cannot be read as a number.
    '''
    try:
        return str(int(float(val)))
    except (TypeError, ValueError, OverflowError):
        return None


def get_loc(vals, to_find, df_stars):
    '''
    Use the HYG v3.0 database to convert the asterism star IDs into
    right ascension and declination values.

    Parameters
    ----------
    vals : iterable
        Star IDs that are matched against the 'hip' column of df_stars.
        Entries whose string form is 'nan' are ignored.
    to_find : str
        Name of the df_stars column to read for each matched star
        (this notebook passes 'ra' or 'dec').
    df_stars : pandas.DataFrame
        Star table containing 'hip', 'hd' and the `to_find` column.

    Returns
    -------
    str or None
        The looked-up values joined with ', ' in input order. IDs that
        cannot be resolved are skipped, so the result is '' when nothing
        matches or when `vals` holds no usable ID. None is returned when
        the first value already contains a comma.
    '''
    # Stars that cannot be found through the 'hip' column, mapped to their HD
    # identifier so they can be looked up through the 'hd' column instead.
    # SOURCE: Hipparcos catalog. Mapped to HD ID identifier.
    # http://tdc-www.harvard.edu/catalogs/hipparcos.html
    # https://www.cosmos.esa.int/web/hipparcos/search-facility
    missing = {'78727': '144069'}

    vals = [x for x in vals if str(x) != 'nan']
    if not vals:
        return ''
    if ',' in str(vals[0]):
        # NOTE(review): presumably the value is already a joined coordinate
        # string; no lookup is attempted for such input -- confirm with callers.
        return None

    vallist = []
    for val in vals:
        df_val = df_stars[df_stars['hip'] == val]
        if len(df_val) == 0:
            # The IDs arrive as numbers while the keys of `missing` are
            # strings, so normalise the ID before consulting the table.
            hd_id = missing.get(_hip_key(val))
            if hd_id is not None:
                # Compare numerically so the match works whether the 'hd'
                # column was read as numbers or as text.
                hd_column = pd.to_numeric(df_stars['hd'], errors='coerce')
                df_val = df_stars[hd_column == float(hd_id)]
        if len(df_val) >= 1:
            vallist.append(str(df_val.iloc[0][to_find]))
    return ', '.join(vallist)
146" \n",
def get_scatter_size_cultures(val, max_count = 35):
    '''
    Scatter point size for one star: max_count plus the square of val.
    In this notebook val is the number of times the star appears in the
    dataset (the 'count' column).
    '''
    squared_count = val ** 2
    point_size = max_count + squared_count
    return point_size
153]
154},
155{
156"cell_type": "code",
157"execution_count": 6,
158"metadata": {},
159"outputs": [
160{
161"data": {
162"text/html": [
163"<div>\n",
164"<style scoped>\n",
165" .dataframe tbody tr th:only-of-type {\n",
166" vertical-align: middle;\n",
167" }\n",
168"\n",
169" .dataframe tbody tr th {\n",
170" vertical-align: top;\n",
171" }\n",
172"\n",
173" .dataframe thead th {\n",
174" text-align: right;\n",
175" }\n",
176"</style>\n",
177"<table border=\"1\" class=\"dataframe\">\n",
178" <thead>\n",
179" <tr style=\"text-align: right;\">\n",
180" <th></th>\n",
181" <th>star_ID</th>\n",
182" <th>count</th>\n",
183" <th>ra</th>\n",
184" <th>dec</th>\n",
185" <th>size</th>\n",
186" </tr>\n",
187" </thead>\n",
188" <tbody>\n",
189" <tr>\n",
190" <th>0</th>\n",
191" <td>26727</td>\n",
192" <td>28</td>\n",
193" <td>5.679313</td>\n",
194" <td>-1.942572</td>\n",
195" <td>819</td>\n",
196" </tr>\n",
197" <tr>\n",
198" <th>1</th>\n",
199" <td>25930</td>\n",
200" <td>28</td>\n",
201" <td>5.533445</td>\n",
202" <td>-0.299092</td>\n",
203" <td>819</td>\n",
204" </tr>\n",
205" <tr>\n",
206" <th>2</th>\n",
207" <td>26311</td>\n",
208" <td>26</td>\n",
209" <td>5.603559</td>\n",
210" <td>-1.20192</td>\n",
211" <td>711</td>\n",
212" </tr>\n",
213" <tr>\n",
214" <th>4</th>\n",
215" <td>21421</td>\n",
216" <td>24</td>\n",
217" <td>4.598677</td>\n",
218" <td>16.509301</td>\n",
219" <td>611</td>\n",
220" </tr>\n",
221" <tr>\n",
222" <th>3</th>\n",
223" <td>17499</td>\n",
224" <td>24</td>\n",
225" <td>3.747927</td>\n",
226" <td>24.113339</td>\n",
227" <td>611</td>\n",
228" </tr>\n",
229" </tbody>\n",
230"</table>\n",
231"</div>"
232],
233"text/plain": [
234" star_ID count ra dec size\n",
235"0 26727 28 5.679313 -1.942572 819\n",
236"1 25930 28 5.533445 -0.299092 819\n",
237"2 26311 26 5.603559 -1.20192 711\n",
238"4 21421 24 4.598677 16.509301 611\n",
239"3 17499 24 3.747927 24.113339 611"
240]
241},
242"metadata": {},
243"output_type": "display_data"
244},
245{
246"name": "stdout",
247"output_type": "stream",
248"text": [
249"0 null values in ra data\n"
250]
251}
252],
253"source": [
'''
Process star ID data into RA/DEC coordinates by asterism and star.

The final data from this cell (star_df) includes:
1. the stellar ID for each star represented in an asterism (star_ID)
2. the number of cultures that use that star (count),
3. the right ascension of the star (ra)
4. the declination of the star (dec)
5. the arbitrary size at which to plot the star in my map design (size)
'''

starlist = [] # keep track of all stars included in all cultures combined

for name in fabfiles:
    fname = './data/processed/skycultures/sorted_constellations/'+name+'_sorted.csv'
    savename = './data/processed/skycultures/constellations_ra_dec/'+name+'_ra_dec.csv'
    # sep=r'\s+' splits on runs of whitespace (replaces the deprecated delim_whitespace=True)
    df_sorted = pd.read_csv(fname, sep=r'\s+', header=None, encoding='utf8', engine='python')

    # The first two columns are kept as ast_ID and vals; the rest are star IDs
    asts = df_sorted[0].tolist()
    vals = df_sorted[1].tolist()
    df_ids = df_sorted.drop(df_sorted.columns[[0, 1]], axis=1)

    # Each star is added once per culture, so its final count is the number
    # of cultures that use it
    starlist_temp = [item for sublist in df_ids.values.tolist() for item in sublist]
    starlist += set(starlist_temp)

    dec = df_ids.apply(lambda row: get_loc(row, to_find='dec', df_stars=df_stars), axis=1).tolist()
    ras = df_ids.apply(lambda row: get_loc(row, to_find='ra', df_stars=df_stars), axis=1).tolist()

    df_new = pd.DataFrame({'ast_ID': asts, 'vals': vals, 'ra': ras, 'dec': dec},
                          columns=['ast_ID', 'vals', 'ra', 'dec'], index=df_ids.index)
    df_new = df_new.sort_values(by='ast_ID')
    df_new.to_csv(savename, index=None)

starlist = [int(x) for x in starlist if str(x) != 'nan']
starlist_savename = './data/processed/starlist_cultures_design.csv'

starlist_df = pd.DataFrame({'star_ID': starlist})
star_df = pd.DataFrame(starlist_df['star_ID'].value_counts().reset_index())
star_df.columns = ['star_ID', 'count']
star_df['ra'] = star_df['star_ID'].apply(lambda x: get_loc([x], to_find='ra', df_stars=df_stars))
star_df['dec'] = star_df['star_ID'].apply(lambda x: get_loc([x], to_find='dec', df_stars=df_stars))
star_df['size'] = star_df['count'].apply(get_scatter_size_cultures)

star_df = star_df.sort_values(by='count', ascending=False)
star_df.to_csv(starlist_savename, index=False)
display(star_df.head())

# Check that all data was successfully converted.
# get_loc returns text, and a star it cannot resolve comes back as an empty
# string rather than NaN, so empty strings are counted as missing as well.
ra_missing = star_df['ra'].isnull() | (star_df['ra'] == '')
print(ra_missing.sum(), 'null values in ra data')
307]
308},
309{
310"cell_type": "code",
311"execution_count": 7,
312"metadata": {},
313"outputs": [
314{
315"data": {
316"text/html": [
317"<div>\n",
318"<style scoped>\n",
319" .dataframe tbody tr th:only-of-type {\n",
320" vertical-align: middle;\n",
321" }\n",
322"\n",
323" .dataframe tbody tr th {\n",
324" vertical-align: top;\n",
325" }\n",
326"\n",
327" .dataframe thead th {\n",
328" text-align: right;\n",
329" }\n",
330"</style>\n",
331"<table border=\"1\" class=\"dataframe\">\n",
332" <thead>\n",
333" <tr style=\"text-align: right;\">\n",
334" <th></th>\n",
335" <th>star_ID</th>\n",
336" <th>count</th>\n",
337" <th>ra</th>\n",
338" <th>dec</th>\n",
339" <th>size</th>\n",
340" <th>culture</th>\n",
341" </tr>\n",
342" </thead>\n",
343" <tbody>\n",
344" <tr>\n",
345" <th>1252</th>\n",
346" <td>110023</td>\n",
347" <td>1</td>\n",
348" <td>22.285139</td>\n",
349" <td>-5.387164</td>\n",
350" <td>36</td>\n",
351" <td>NaN</td>\n",
352" </tr>\n",
353" <tr>\n",
354" <th>1253</th>\n",
355" <td>10623</td>\n",
356" <td>1</td>\n",
357" <td>2.279251</td>\n",
358" <td>83.561414</td>\n",
359" <td>36</td>\n",
360" <td>NaN</td>\n",
361" </tr>\n",
362" <tr>\n",
363" <th>1254</th>\n",
364" <td>100751</td>\n",
365" <td>1</td>\n",
366" <td>20.427459</td>\n",
367" <td>-56.735090</td>\n",
368" <td>36</td>\n",
369" <td>NaN</td>\n",
370" </tr>\n",
371" <tr>\n",
372" <th>1255</th>\n",
373" <td>117299</td>\n",
374" <td>1</td>\n",
375" <td>23.783866</td>\n",
376" <td>57.451359</td>\n",
377" <td>36</td>\n",
378" <td>NaN</td>\n",
379" </tr>\n",
380" <tr>\n",
381" <th>1256</th>\n",
382" <td>25142</td>\n",
383" <td>1</td>\n",
384" <td>5.380556</td>\n",
385" <td>3.544452</td>\n",
386" <td>36</td>\n",
387" <td>NaN</td>\n",
388" </tr>\n",
389" </tbody>\n",
390"</table>\n",
391"</div>"
392],
393"text/plain": [
394" star_ID count ra dec size culture\n",
395"1252 110023 1 22.285139 -5.387164 36 NaN\n",
396"1253 10623 1 2.279251 83.561414 36 NaN\n",
397"1254 100751 1 20.427459 -56.735090 36 NaN\n",
398"1255 117299 1 23.783866 57.451359 36 NaN\n",
399"1256 25142 1 5.380556 3.544452 36 NaN"
400]
401},
402"metadata": {},
403"output_type": "display_data"
404}
405],
406"source": [
'''
For all stars that are only included in one culture,
go back and find the culture that contains the star.

Each culture file is read once to build a star -> culture lookup. The result
is cached on disk (savename); when the cached file already exists it is loaded
instead of being recomputed, so the displayed table always has the culture
column filled in.
'''

fname = './data/processed/starlist_cultures_design.csv'
savename = './data/processed/starlist_cultures_design_single.csv'

if os.path.exists(savename):
    # Reuse the earlier result instead of recomputing it
    df = pd.read_csv(savename)
else:
    star_to_culture = {}
    for name in fabfiles:
        sorted_fname = './data/processed/skycultures/sorted_constellations/'+name+'_sorted.csv'
        df_new = pd.read_csv(sorted_fname, sep=r'\s+', header=None, encoding='utf8', engine='python')
        # The first two columns are not star IDs, so they are dropped here
        df_ids = df_new.drop(df_new.columns[[0, 1]], axis=1)
        starlist_temp = set(item for sublist in df_ids.values.tolist() for item in sublist)
        for star in starlist_temp:
            if str(star) != 'nan':
                # If several cultures contained the star, the last one in fabfiles wins
                star_to_culture[int(star)] = name

    df = pd.read_csv(fname)
    # .copy() so that adding a column does not write into a slice of the full table
    df = df[df['count'] == 1].copy()
    df['culture'] = df['star_ID'].map(star_to_culture)
    df.to_csv(savename, index=False)

display(df.head())
435]
436},
437{
438"cell_type": "code",
439"execution_count": 8,
440"metadata": {},
441"outputs": [
442{
443"data": {
444"text/html": [
445"<div>\n",
446"<style scoped>\n",
447" .dataframe tbody tr th:only-of-type {\n",
448" vertical-align: middle;\n",
449" }\n",
450"\n",
451" .dataframe tbody tr th {\n",
452" vertical-align: top;\n",
453" }\n",
454"\n",
455" .dataframe thead th {\n",
456" text-align: right;\n",
457" }\n",
458"</style>\n",
459"<table border=\"1\" class=\"dataframe\">\n",
460" <thead>\n",
461" <tr style=\"text-align: right;\">\n",
462" <th></th>\n",
463" <th>star_ID_pair</th>\n",
464" <th>culture</th>\n",
465" <th>ras</th>\n",
466" <th>decs</th>\n",
467" </tr>\n",
468" </thead>\n",
469" <tbody>\n",
470" <tr>\n",
471" <th>0</th>\n",
472" <td>7588, 9007</td>\n",
473" <td>western</td>\n",
474" <td>1.628556, 1.932564</td>\n",
475" <td>-57.236757, -51.608896</td>\n",
476" </tr>\n",
477" <tr>\n",
478" <th>1</th>\n",
479" <td>9007, 10602</td>\n",
480" <td>western</td>\n",
481" <td>1.932564, 2.275154</td>\n",
482" <td>-51.608896, -51.512165</td>\n",
483" </tr>\n",
484" <tr>\n",
485" <th>2</th>\n",
486" <td>10602, 11407</td>\n",
487" <td>western</td>\n",
488" <td>2.275154, 2.449755</td>\n",
489" <td>-51.512165, -47.70384</td>\n",
490" </tr>\n",
491" <tr>\n",
492" <th>3</th>\n",
493" <td>11407, 12413</td>\n",
494" <td>western</td>\n",
495" <td>2.449755, 2.663326</td>\n",
496" <td>-47.70384, -42.89167</td>\n",
497" </tr>\n",
498" <tr>\n",
499" <th>4</th>\n",
500" <td>12413, 12486</td>\n",
501" <td>western</td>\n",
502" <td>2.663326, 2.677781</td>\n",
503" <td>-42.89167, -39.855375</td>\n",
504" </tr>\n",
505" </tbody>\n",
506"</table>\n",
507"</div>"
508],
509"text/plain": [
510" star_ID_pair culture ras decs\n",
511"0 7588, 9007 western 1.628556, 1.932564 -57.236757, -51.608896\n",
512"1 9007, 10602 western 1.932564, 2.275154 -51.608896, -51.512165\n",
513"2 10602, 11407 western 2.275154, 2.449755 -51.512165, -47.70384\n",
514"3 11407, 12413 western 2.449755, 2.663326 -47.70384, -42.89167\n",
515"4 12413, 12486 western 2.663326, 2.677781 -42.89167, -39.855375"
516]
517},
518"execution_count": 8,
519"metadata": {},
520"output_type": "execute_result"
521}
522],
523"source": [
'''
Combine all culture data into a dataframe by star ID pair.

The final data from this cell (df_main) includes:
1. The two star IDs in that vector line, ordered from smallest to largest ID (star_ID_pair)
2. the culture responsible for that line (culture)
   NOTE: each line is repeated for as many times as there are cultures
3. the right ascensions of the two stars (ras)
4. the declinations of the two stars (decs)
'''

savename = './data/processed/star_pairs.csv'
pair_columns = ['star_ID_pair', 'culture', 'ras', 'decs']
culture_frames = []
loc_cache = {}  # (star ID, column) -> looked-up value, shared by all cultures


def cached_loc(star_id, to_find):
    ''' Look up one star through get_loc, scanning the star table only once per star and column. '''
    key = (star_id, to_find)
    if key not in loc_cache:
        loc_cache[key] = get_loc([star_id], to_find=to_find, df_stars=df_stars)
    return loc_cache[key]


for culture in fabfiles:
    fname = './data/processed/skycultures/sorted_constellations/'+culture+'_sorted.csv'
    df = pd.read_csv(fname, sep=r'\s+', header=None, encoding='utf8', engine='python')

    star_ID_pairs = []
    for index, row in df.iterrows():
        temp_list = [x for x in row.tolist() if str(x) != 'nan'] # remove extra cells
        temp_list = temp_list[2:] # remove asterism number and number of stars
        # some errors in chinese medieval where last star is repeated
        if len(temp_list) % 2 == 1 and len(temp_list) >= 3 and temp_list[-1] == temp_list[-2]:
            temp_list = temp_list[:-1]
        if len(temp_list) == 1:
            # A single star has no partner to draw a line to, so it yields no pair
            continue
        assert len(temp_list) % 2 == 0
        temp_list = [int(x) for x in temp_list] # all strings and non-numbers should now be gone
        temp_list = [temp_list[i:i+2] for i in range(0, len(temp_list), 2)] # break into line pairs
        star_ID_pairs += [sorted(x) for x in temp_list] # order line pairs smallest to largest

    ras = [', '.join(str(cached_loc(s, 'ra')) for s in pair) for pair in star_ID_pairs]
    decs = [', '.join(str(cached_loc(s, 'dec')) for s in pair) for pair in star_ID_pairs]
    pair_labels = [', '.join(str(s) for s in pair) for pair in star_ID_pairs]

    df = pd.DataFrame({'star_ID_pair': pair_labels, 'culture': [culture]*len(pair_labels),
                       'ras': ras, 'decs': decs}, columns=pair_columns)
    culture_frames.append(df)

# Concatenate once, with the most recently processed culture first (the same
# row order as prepending each culture to the running table).
if culture_frames:
    df_main = pd.concat(culture_frames[::-1])
else:
    df_main = pd.DataFrame({'star_ID_pair':[], 'culture':[], 'ras':[], 'decs':[]}, columns=pair_columns)

df_main.to_csv(savename, index=False)
df_main.head()
577]
578},
579{
580"cell_type": "code",
581"execution_count": null,
582"metadata": {},
583"outputs": [],
584"source": []
585}
586],
587"metadata": {
588"kernelspec": {
589"display_name": "Python 3",
590"language": "python",
591"name": "python3"
592},
593"language_info": {
594"codemirror_mode": {
595"name": "ipython",
596"version": 3
597},
598"file_extension": ".py",
599"mimetype": "text/x-python",
600"name": "python",
601"nbconvert_exporter": "python",
602"pygments_lexer": "ipython3",
603"version": "3.7.1"
604}
605},
606"nbformat": 4,
607"nbformat_minor": 2
608}
609