# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
- """
- This will generate a movie data set of 1100 records.
- These are the first 1100 movies which appear when querying the Freebase of type '/film/film'.
- Here is the link to the freebase page - https://www.freebase.com/film/film?schema=
- Usage - python3 film_data_generator.py
- """
import csv
import copy
import datetime
import json
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from xml.dom import minidom
MAX_ITERATIONS = 10  # the initial query plus 10 more pages (100 docs each) limits it to 1100 docs

# You need a Google developer API key to run this
API_KEY = '<insert your Google developer API key>'
service_url = 'https://www.googleapis.com/freebase/v1/mqlread'
query = [{
    "id": None,
    "name": None,
    "initial_release_date": None,
    "directed_by": [],
    "genre": [],
    "type": "/film/film",
    "initial_release_date>": "2000"
}]
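# In MQL, None and [] mark fields Freebase should fill in for each match,
# "type": "/film/film" restricts the matches to films, and the
# "initial_release_date>" key is a comparison operator filtering to films
# released after 2000. mqlread returns one page of results per request
# (100 per page, per the MAX_ITERATIONS note above), paged via a cursor.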
def gen_csv(filmlist):
    filmlistDup = copy.deepcopy(filmlist)
    # Join each multi-valued field into a single pipe-delimited string
    for film in filmlistDup:
        for key in film:
            if isinstance(film[key], list):
                film[key] = '|'.join(film[key])
    # 'type' is deleted from every record in do_query(), so it is not a column
    keys = ['name', 'directed_by', 'genre', 'id', 'initial_release_date']
    with open('films.csv', 'w', newline='', encoding='utf8') as csvfile:
        dict_writer = csv.DictWriter(csvfile, keys)
        dict_writer.writeheader()
        dict_writer.writerows(filmlistDup)
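# A films.csv data row might then look like this (hypothetical values):
#
#   Some Film,Some Director,Drama|Comedy,/en/some_film,2001-05-04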
def gen_json(filmlist):
    # No copy needed here: the records are serialized as-is, never modified
    with open('films.json', 'w', encoding='utf8') as jsonfile:
        json.dump(filmlist, jsonfile, indent=2)
def gen_xml(filmlist):
    root = ET.Element("add")
    for film in filmlist:
        doc = ET.SubElement(root, "doc")
        for key in film:
            if isinstance(film[key], list):
                for value in film[key]:
                    field = ET.SubElement(doc, "field")
                    field.set("name", key)
                    field.text = value
            else:
                field = ET.SubElement(doc, "field")
                field.set("name", key)
                field.text = film[key]
    tree = ET.ElementTree(root)
    with open('films.xml', 'w', encoding='utf8') as f:
        f.write(minidom.parseString(ET.tostring(tree.getroot(), 'utf-8')).toprettyxml(indent="  "))
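# films.xml is written in Solr's XML update format; the output looks like
# this (illustrative):
#
#   <add>
#     <doc>
#       <field name="name">Some Film</field>
#       <field name="genre">Drama</field>
#     </doc>
#   </add>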
def do_query(filmlist, cursor=""):
    params = {
        'query': json.dumps(query),
        'key': API_KEY,
        'cursor': cursor
    }
    url = service_url + '?' + urllib.parse.urlencode(params)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    response = json.loads(data)
    for item in response['result']:
        del item['type']  # Always '/film/film', so there is no point in keeping it
        try:
            datetime.datetime.strptime(item['initial_release_date'], "%Y-%m-%d")
        except ValueError:
            # The date is not in YYYY-MM-DD form; keep it simple by dropping
            # the date field from this doc rather than trying to repair it
            del item['initial_release_date']
        filmlist.append(item)
    return response.get("cursor")
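# Paging: the first call passes an empty cursor; each response carries a
# cursor for the next page, and the Freebase API returns a false cursor once
# results are exhausted, which is what terminates the while-loop below.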
if __name__ == "__main__":
    filmlist = []
    cursor = do_query(filmlist)
    i = 0
    while cursor:
        cursor = do_query(filmlist, cursor)
        i += 1
        if i == MAX_ITERATIONS:
            break
    gen_json(filmlist)
    gen_csv(filmlist)
    gen_xml(filmlist)