# -*- coding: utf-8 -*-
"""Helpers around Google Trends (pytrends) and Google News.

Provides:
  * getNews            -- scrape article titles/urls for a keyword
  * getRelatedWords    -- top related search queries for one keyword
  * getSomeRelatedWords-- related queries merged across several keywords
  * getOvertime        -- interest-over-time series (with recent-day backfill)
  * getOvertimes       -- per-keyword + combined interest-over-time series

All functions are best-effort: on failure they log at DEBUG level and
return a "no data" placeholder instead of raising.
"""
from pytrends.request import TrendReq
from GoogleNews import GoogleNews
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import urllib.request
import time as tm
import json
import logging
import math
import re
import sys

sys.path.append('/var/www/controlpanel-api/lib/')


def getNews(keyword, url_num=16, limit=16, news_num=10, pause=2.0):
    """Fetch Google News articles matching *keyword*.

    @param keyword  str: search phrase passed to GoogleNews
    @param url_num  unused; kept for backward compatibility with the old
                    googlesearch-based implementation
    @param limit    unused; kept for backward compatibility
    @param news_num int: maximum number of articles to return
    @param pause    unused; kept for backward compatibility
    @return dict e.g. {'news': [{'url': 'url1', 'title': 'title1'}, ...]}.
            {'news': [{'url': 'No Data', 'title': 'No Data'}]} when the
            search worked but nothing usable was found;
            {'news': None} when the search itself failed before any result.
    """
    logging.debug("getNews() in")
    results = []
    try:
        # Some sites reject urllib's default User-Agent, so present a
        # browser-like one.
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        googlenews = GoogleNews(lang="jp")
        googlenews.search(keyword)
        urls = googlenews.get_links()
        logging.debug(urls)
        for url in urls:
            if len(results) == news_num:
                break
            try:
                # BUG FIX: the original called urllib.request.urlopen(),
                # which bypassed the opener (and its User-Agent header)
                # built above, and never closed the response socket.
                with opener.open(url) as page:
                    soup = BeautifulSoup(page, features="lxml")
                # Pages without a <title> tag are skipped (the original
                # skipped them too, via an AttributeError).
                title = soup.title.string if soup.title is not None else None
                if title is not None and title != 'Request Rejected':
                    logging.debug(str(title))
                    logging.debug(url)
                    results.append(dict(url=url, title=title))
            except Exception as e:
                # Best-effort: a single bad URL must not abort the loop.
                logging.debug('getNews() error')
                logging.debug(str(e), exc_info=True)
        if not results:
            results.append(dict(url='No Data', title='No Data'))
        data = dict(news=results)
        logging.debug("getNews() out")
        logging.debug(data)
        return data
    except Exception as e:
        logging.debug('getNews() error')
        logging.debug(str(e), exc_info=True)
        data = dict(news=results) if results else dict(news=None)
        logging.debug("getNews() out")
        return data


def getRelatedWords(keyword, interest_term="today 1-m"):
    """Return up to five related search queries for *keyword*.

    @param keyword       str: a single search term
    @param interest_term str: pytrends timeframe (e.g. "today 1-m")
    @return list of dicts, e.g.
        [{'word': '工場 建設', 'value': 11, 'keyword': '工場'},
         {'word': '工場 炎上', 'value': 13, 'keyword': '工場'}]
        On failure a single placeholder entry is returned:
        [{'word': None, 'value': None, 'keyword': keyword}]
    """
    data = []
    pytrend = TrendReq(hl='ja-JP', tz=360)
    pytrend.build_payload(kw_list=[keyword], timeframe=interest_term)
    try:
        related_results = pytrend.related_queries()
    except Exception as e:
        logging.debug('getRelatedWords() error(2)')
        logging.debug(str(e), exc_info=True)
        data.append(dict(word=None, value=None, keyword=keyword))
        return data
    try:
        # 'top' is a DataFrame (or None when Trends has no data, in which
        # case to_json raises and we fall into the placeholder branch).
        raw_top = json.loads(
            related_results[keyword]['top'].to_json(orient='values'))
    except Exception as e:
        logging.debug('getRelatedWords() error(3)')
        logging.debug(str(e), exc_info=True)
        data.append(dict(word=None, value=None, keyword=keyword))
        return data
    # Keep only the five strongest related queries.
    for row in raw_top[:5]:
        data.append(dict(word=row[0], value=row[1], keyword=keyword))
    logging.debug(data)
    return data


def getSomeRelatedWords(keywords, interest_term="now 7-d"):
    """Collect related queries for several keywords.

    @param keywords      list of str search terms
    @param interest_term str: pytrends timeframe (default "now 7-d")
    @return dict mapping each keyword to its getRelatedWords() list, plus
        an 'all_keywords' entry holding a merged summary (at most 8 rows).
        With a single keyword only 'all_keywords' is returned. On failure:
        {'all_keywords': [{'word': None, 'value': None, 'keyword': keywords}]}
    """
    logging.debug('getSomeRelatedWords() in')
    logging.debug(keywords)
    try:
        if not keywords:
            # BUG FIX: the original fell through to code that read an
            # undefined local when keywords was empty (NameError).
            return {'all_keywords': [
                dict(word=None, value=None, keyword=keywords)]}
        all_relateds = {}
        for keyword in keywords:
            all_relateds[keyword] = getRelatedWords(keyword, interest_term)
        if len(keywords) == 1:
            return {"all_keywords": all_relateds[keywords[0]]}
        # Several keywords: build a merged 'all_keywords' summary from the
        # keywords that actually returned data.
        # BUG FIX: guard against an empty result list before indexing [0]
        # (the original raised IndexError and lost all results).
        exist_data_keys = [
            keyword for keyword in keywords
            if all_relateds[keyword] and all_relateds[keyword][0]['word']]
        exist_data_key_num = len(exist_data_keys)
        if exist_data_key_num > 1:
            # Take enough entries per keyword to fill roughly 8 slots.
            # BUG FIX: the original left the per-keyword count undefined
            # for five or more keywords, raising NameError.
            per_keyword = {2: 4, 3: 3, 4: 2}.get(
                exist_data_key_num, max(1, 8 // exist_data_key_num))
            top_related_summary = []
            for keyword in sorted(exist_data_keys):
                top_related_summary.extend(
                    all_relateds[keyword][:per_keyword])
            all_relateds['all_keywords'] = top_related_summary[:8]
        elif exist_data_key_num == 1:
            all_relateds['all_keywords'] = all_relateds[exist_data_keys[0]]
        else:
            all_relateds['all_keywords'] = [
                dict(word=None, value=None, keyword=keywords)]
        logging.debug('getSomeRelatedWords() out')
        logging.debug(all_relateds)
        return all_relateds
    except Exception as e:
        logging.debug('getSomeRelatedWords() error')
        logging.debug(str(e), exc_info=True)
        return {'all_keywords': [
            dict(word=None, value=None, keyword=keywords)]}


def getOvertime(keywords, interest_term):
    """Return an interest-over-time series for *keywords*.

    @param keywords      list of str search terms (queried together)
    @param interest_term str: pytrends timeframe; either a preset such as
                         "today 1-m" or an explicit range
                         "YYYY-MM-DD YYYY-MM-DD"
    @return dict e.g.
        {'data': [{'date': '2017-08-13', 'keyword1': 36, 'keyword2': 45},
                  ...],
         'label': ['keyword1', 'keyword2', 'isPartial', 'date']}
        {'data': None, 'label': None} when nothing could be fetched.
    """
    logging.debug(f'keyword: {keywords}')
    pytrend = TrendReq()
    pytrend.build_payload(kw_list=keywords, timeframe=interest_term)
    try:
        results_overtime = pytrend.interest_over_time()
        logging.debug(results_overtime)
    except Exception as e:
        logging.debug('getOvertime() error')
        logging.debug(str(e), exc_info=True)
        return dict(data=None, label=None)
    # Transpose so the JSON round-trip yields {iso_date: {kw: value, ...}}.
    raw_overtime = json.loads(results_overtime.T.to_json(date_format="iso"))
    ot_dates = list(raw_overtime.keys())
    tmp_ot_data = raw_overtime
    for d in ot_dates:
        # First 10 chars of the ISO timestamp = 'YYYY-MM-DD'.
        tmp_ot_data[d].update({'date': d[:10]})
    # BUG FIX: dict.values() is a view in Python 3; the original later
    # called .append() on it, raising AttributeError. Use a list.
    ot_data = list(tmp_ot_data.values())
    ot_labels = list(tmp_ot_data[ot_dates[0]].keys())
    try:
        # Explicit short date ranges (<= 28 days): Google's daily data can
        # lag behind "today", so estimate the missing tail days from the
        # hourly 'now 7-d' data, scaled to the daily series.
        is_date = re.search(r"[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}", interest_term)
        if is_date:
            term_list = interest_term.split(' ')  # '2017-08-09 2017-08-31'
            dt_delta = (datetime.strptime(term_list[1], '%Y-%m-%d')
                        - datetime.strptime(term_list[0], '%Y-%m-%d'))
            if int(math.fabs(dt_delta.days)) <= 28:
                # BUG FIX: the original indexed tmp_ot_data with an int
                # unix timestamp although its keys are ISO strings; keep
                # the actual key of the newest row instead.
                last_key = max(ot_dates)
                last_day = datetime.strptime(last_key[:10], '%Y-%m-%d')
                str_last_day = last_day.strftime('%Y-%m-%d')
                today = datetime.today()
                num_of_shortage_data = int((today - last_day).days)
                if num_of_shortage_data > 0:  # some recent days are missing
                    pytrend.build_payload(
                        kw_list=keywords, timeframe='now 7-d')
                    results_overtime = pytrend.interest_over_time()
                    logging.debug(results_overtime)
                    ot_means = {}
                    for i in range(num_of_shortage_data + 1):
                        # Daily mean of the hourly values for each missing
                        # day. BUG FIX: the original built timestamps with
                        # last_day.day + i, which overflows at month end;
                        # timedelta rolls over correctly, and datetime
                        # bounds are what DataFrame.loc slicing expects.
                        day_start = last_day + timedelta(days=i)
                        day_end = day_start + timedelta(hours=23, minutes=59)
                        str_day = day_start.strftime("%Y-%m-%d")
                        val_mean = json.loads(
                            results_overtime.loc[day_start:day_end]
                            .mean().to_json())
                        # e.g. {'2017-09-05': {'word_1': 10, 'word_2': 80}}
                        ot_means[str_day] = val_mean
                    ratios = {}
                    for keyword in keywords:
                        # Scale factor between the daily series and the
                        # hourly means, anchored on the overlapping day.
                        if (tmp_ot_data[last_key][keyword] != 0
                                and ot_means[str_last_day][keyword] != 0):
                            ratios[keyword] = (
                                tmp_ot_data[last_key][keyword]
                                / ot_means[str_last_day][keyword])
                        else:
                            ratios[keyword] = 0
                    for ot_day in ot_means.keys():
                        if ot_day == str_last_day:
                            continue  # already present in the daily data
                        tmp_ot_dict = {'date': ot_day}
                        for keyword in keywords:
                            if tmp_ot_data[last_key][keyword] == 0:
                                tmp_val = ot_means[ot_day][keyword]
                            elif (ot_means[ot_day][keyword] != 0
                                    and ratios[keyword] != 0):
                                tmp_val = int(round(
                                    ot_means[ot_day][keyword]
                                    * ratios[keyword]))
                                # Trends values are capped at 100.
                                tmp_val = min(tmp_val, 100)
                            else:
                                tmp_val = 0
                            tmp_ot_dict[keyword] = tmp_val
                        ot_data.append(tmp_ot_dict)
    except Exception as e:
        # The backfill is best-effort: on failure return whatever daily
        # data was already collected.
        logging.debug('getOvertime() error')
        logging.debug(str(e), exc_info=True)
        if ot_data:
            logging.debug('error: ret=ot_data')
            return dict(data=ot_data, label=ot_labels)
        logging.debug('error: ret=None')
        return dict(data=None, label=None)
    logging.debug('OK: ret=ot_data')
    return dict(data=ot_data, label=ot_labels)


def getOvertimes(keywords, interest_term="today 1-m"):
    """Interest-over-time per keyword plus a combined series.

    @param keywords      list of str search terms
    @param interest_term str: pytrends timeframe (default "today 1-m")
    @return dict: one getOvertime() result per keyword (only when several
        keywords are given) plus an 'all_keywords' entry for the combined
        query, e.g. {'keyword1': {'data': [...], 'label': [...]},
                     'all_keywords': {...}}
        On failure 'all_keywords' is {'data': None, 'label': None}.
    """
    results = {}
    key_num = len(keywords)
    try:
        if key_num > 1:
            for keyword in keywords:
                results[keyword] = getOvertime([keyword], interest_term)
            results["all_keywords"] = getOvertime(keywords, interest_term)
        elif key_num == 1:
            results["all_keywords"] = getOvertime(keywords, interest_term)
    except Exception as e:
        logging.debug('getOvertimes() error')
        logging.debug(str(e), exc_info=True)
        results["all_keywords"] = dict(data=None, label=None)
    return results