Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2This module handles getting data from the MCD by scraping the cgi interface. 

3 

4We simply pass parameters up in the url, like the web version interface does. 

5Then we scrape the resulting web page for the link to the data and (optionally) 

6the image[s]. 

7 

8 

9Note that this is a simple scraper and is not in any sense affiliated with the 

10MCD project. Please do not run it against the server too often or 

11unreasonably. Where possible use the saved output (this is why we provide a 

12saved output). 

13""" 

14import random 

15from collections import namedtuple 

16from logging import getLogger 

17from pathlib import Path 

18import time 

19from typing import Union 

20 

21import requests 

22from bs4 import BeautifulSoup 

23from requests.exceptions import ConnectionError 

24 

# Module-level logger named after this module; used by get() and fetch_data()
# to report progress and connection retries.
logger = getLogger(__name__)

26 

27 

# Default query parameters sent to the MCD CGI endpoint. fetch_data() works on
# a copy of this dict and overlays the caller's keyword arguments on top of
# it, so the defaults here are never mutated by that function.
# NOTE(review): the meaning of each key is defined by the MCD web form and is
# not visible in this file -- consult the web interface before changing one.
base_params = {
    "datekeyhtml": 1,
    "ls": 85.3,
    "localtime": 0.0,  # noqa
    "year": None,
    "month": None,
    "day": None,
    "hours": None,
    "minutes": None,
    "seconds": None,
    "julian": None,
    "martianyear": None,
    "martianmonth": None,
    "sol": None,
    "latitude": "all",
    "longitude": "all",
    "altitude": 10.0,
    "zkey": 3,
    "isfixedlt": "off",
    "dust": 1,
    "hrkey": 1,
    "zonmean": "off",
    "var1": "mtot",
    "var2": "t",
    "var3": "p",
    # The string "none" (not the None object) is what gets sent for this key.
    "var4": "none",
    "dpi": 80,
    "islog": "off",
    "colorm": "Blues",
    "minval": None,
    "maxval": None,
    "proj": "cyl",
    "plat": None,
    "plon": None,
    "trans": None,
    "iswind": "off",
    "latpoint": None,
    "lonpoint": None,
}
"""Parameters which can be passed to the server. Defaults set here are
extracted from the web interface. Any parameter set to `None` will not be
passed. To pass `"none"` use a string. Do not override this dict directly;
rather pass the parameter and value as keyword arguments to `fetch_data()`."""

71 

72 

73urlbase = "http://www-mars.lmd.jussieu.fr/mcd_python/" 

74url = urlbase + "cgi-bin/mcdcgi.py" 

75_FetchedFiles = namedtuple("_FetchedFiles", ["dataf", "imgf"]) 

76 

77 

def generate_fn(**params) -> str:
    """
    Build the output filename that corresponds to a set of parameters.

    `fetch_data()` calls this with the keyword overrides it was given; it is
    exposed so callers can work out the filename for a given set of params
    themselves.

    Each parameter whose value is not `None` contributes a `name_value`
    fragment, in the order the keyword arguments were passed. Fragments are
    joined with `-`, prefixed with `marsdata_` and suffixed with `.txt`.

    Args:
        **params: params to consider.

    Returns:
        (str): Fn from params.
    """
    fragments = [
        f"{name}_{value}" for name, value in params.items() if value is not None
    ]
    stem = "-".join(fragments)
    return f"marsdata_{stem}.txt"

94 

95 

class FetchingError(Exception):
    """
    Raised when the server could not provide the requested resource.

    The server answers `200` even on failure and puts an html error message
    in the page, so the failure is surfaced as this exception with the
    server's message passed along.
    """

103 

104 

def get(*args, max_wait: int = 30, **kwargs) -> requests.Response:
    """
    Get with exponential backoff.

    Calls `requests.get` and, on a connection failure, sleeps and retries.
    The pause before retry number `n` (counting from 0) is `2 ** n` seconds
    plus up to one second of random jitter, and is never longer than the time
    left in the `max_wait` budget. At least one attempt is always made, even
    when `max_wait` is zero or negative.

    Args:
        *args: Args for requests.get
        max_wait (int): Max seconds to keep retrying. (Default value = 30)
        **kwargs: Kwargs for requests.get

    Returns:
        (requests.Response): response

    Raises:
        ConnectionError: if unable to connect within `max_wait` seconds.
    """
    # time.monotonic() counts in seconds, so the budget is used as given.
    deadline = time.monotonic() + max_wait
    attempt = 0
    while True:
        try:
            return requests.get(*args, **kwargs)
        except ConnectionError as err:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                logger.error("Failed to fetch.")
                raise ConnectionError("Max retries exceeded.") from err
            # Delay doubles with every failed attempt; the jitter keeps
            # several clients from retrying in lockstep. Capping to the
            # remaining budget ensures we never sleep past the deadline.
            wait = min((2 ** attempt) + random.uniform(0, 1), remaining)
            attempt += 1
            logger.warning(f"Failed to connect, pausing {wait} s and retrying")
            time.sleep(wait)

134 

135 

def fetch_data(
    outdir: Union[Path, str] = ".",
    get_data: bool = True,
    get_img: bool = False,
    **params,
) -> "_FetchedFiles":
    """
    Fetch data from the MCD and save in outdir.

    Keyword arguments (other than `outdir`, `get_data` and `get_img`) will
    override the defaults in `base_params`. The query is sent to the CGI
    script, the returned page is scraped for the first link (the data file)
    and the first image, and the requested ones are downloaded.

    The output filename is built by `generate_fn()` from the keyword
    overrides only, not from the defaults in `base_params`; the image uses
    the same name with a `.png` suffix.

    Args:
        outdir (Union[Path, str]): dir to save in (Default value = "."). A
            string is expanded (`~`) and resolved to an absolute path; a
            `Path` is used as given.
        get_data (bool): get data or not (Default value = True)
        get_img (bool): get img or not (Default value = False)
        **params: Parameters to override.

    Raises:
        FetchingError: Failed to fetch requested data: the server reported an
            error, the page did not contain the expected link or image, or a
            download returned an error status.

    Returns:
        (_FetchedFiles): named tuple `(dataf, imgf)` of the saved paths. An
            entry is `None` when that file was not requested.

    Call this function to retrieve data from the server and save it in a file.
    Keyword arguments passed here will override the defaults in `base_params`,
    e.g.:

    ```python
    >> fetch_data(ls=0.5, localtime=1).dataf.name
    "marsdata_ls_0.5-localtime_1.txt"
    ```
    For more information on any particular parameter see the web interface.
    """
    query = base_params.copy()
    query.update(params)
    logger.info("Fetching page")
    r = get(url, params=query)
    # The server reports failures inside a normal page, so look for its marker.
    if "Ooops!" in r.text:
        raise FetchingError(f"Failed to download, server said {r.text}")
    soup = BeautifulSoup(r.text, features="html.parser")
    if isinstance(outdir, str):
        outdir = Path(outdir).expanduser().resolve()

    dataf, imgf = None, None
    fname = generate_fn(**params)
    body = soup.body

    if get_data:
        link = body.a if body is not None else None
        href = link.get("href") if link is not None else None
        if not href:
            raise FetchingError("Failed to download, no data link in server response")
        # Links on the page are relative ("../..."); rebase them on urlbase.
        data_url = urlbase + href.replace("../", "")
        logger.info(f"Fetching ascii data from {data_url}")
        r = get(data_url)
        if not r.ok:
            # Do not save an error page as if it were data.
            raise FetchingError(
                f"Failed to download {data_url}, server returned {r.status_code}"
            )
        dataf = outdir / fname
        with dataf.open("w") as f:
            f.write(r.text)

    if get_img:
        img = body.img if body is not None else None
        src = img.get("src") if img is not None else None
        if not src:
            raise FetchingError("Failed to download, no image in server response")
        img_url = urlbase + src.replace("../", "")
        logger.info(f"Fetching img from {img_url}")
        r = get(img_url)
        if not r.ok:
            raise FetchingError(
                f"Failed to download {img_url}, server returned {r.status_code}"
            )
        imgf = (outdir / fname).with_suffix(".png")
        with imgf.open("wb") as im:
            im.write(r.content)

    return _FetchedFiles(dataf, imgf)