film_data_generator.py

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  15. """
  16. This will generate a movie data set of 1100 records.
  17. These are the first 1100 movies which appear when querying the Freebase of type '/film/film'.
  18. Here is the link to the freebase page - https://www.freebase.com/film/film?schema=
  19. Usage - python3 film_data_generator.py
  20. """
import csv
import copy
import json
import codecs
import datetime
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from xml.dom import minidom

MAX_ITERATIONS = 10  # 10 iterations limits the run to 1100 docs
# You need a Google developer API key to run this
API_KEY = '<insert your Google developer API key>'
service_url = 'https://www.googleapis.com/freebase/v1/mqlread'
query = [{
    "id": None,
    "name": None,
    "initial_release_date": None,
    "directed_by": [],
    "genre": [],
    "type": "/film/film",
    "initial_release_date>": "2000"
}]
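# In MQL, a property set to None or [] asks the service to return that value
# (a list for multi-valued properties such as directed_by and genre), while
# "initial_release_date>" is a comparison operator restricting results to
# films released after 2000.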
def gen_csv(filmlist):
    filmlistDup = copy.deepcopy(filmlist)
    # Convert multi-valued fields to a pipe-delimited string
    for film in filmlistDup:
        for key in film:
            if isinstance(film[key], list):
                film[key] = '|'.join(film[key])
    keys = ['name', 'directed_by', 'genre', 'type', 'id', 'initial_release_date']
    with open('films.csv', 'w', newline='', encoding='utf8') as csvfile:
        dict_writer = csv.DictWriter(csvfile, keys)
        dict_writer.writeheader()
        dict_writer.writerows(filmlistDup)
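# Illustrative CSV row shape (hypothetical values, for documentation only):
#   name,directed_by,genre,type,id,initial_release_date
#   <film name>,<director a>|<director b>,<genre>,,/m/<mid>,2006-01-01
# The 'type' column stays empty because do_query() drops that field and
# csv.DictWriter fills missing keys with its default restval of ''.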
def gen_json(filmlist):
    with open('films.json', 'w', encoding='utf8') as jsonfile:
        jsonfile.write(json.dumps(filmlist, indent=2))
def gen_xml(filmlist):
    root = ET.Element("add")
    for film in filmlist:
        doc = ET.SubElement(root, "doc")
        for key in film:
            if isinstance(film[key], list):
                for value in film[key]:
                    field = ET.SubElement(doc, "field")
                    field.set("name", key)
                    field.text = value
            else:
                field = ET.SubElement(doc, "field")
                field.set("name", key)
                field.text = film[key]
    tree = ET.ElementTree(root)
    with open('films.xml', 'w', encoding='utf8') as f:
        f.write(minidom.parseString(ET.tostring(tree.getroot(), 'utf-8')).toprettyxml(indent="  "))
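# The <add><doc><field name="..."> layout produced here matches the XML update
# format accepted by Apache Solr's update handler; multi-valued fields simply
# become repeated <field> elements with the same name.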
def do_query(filmlist, cursor=""):
    params = {
        'query': json.dumps(query),
        'key': API_KEY,
        'cursor': cursor
    }
    url = service_url + '?' + urllib.parse.urlencode(params)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    response = json.loads(data)
    for item in response['result']:
        del item['type']  # It's always /film/film, so there is no point in keeping it.
        try:
            datetime.datetime.strptime(item['initial_release_date'], "%Y-%m-%d")
        except ValueError:
            # The date is not formatted properly. Keep it simple by removing the date field from that doc.
            del item['initial_release_date']
        filmlist.append(item)
    return response.get("cursor")
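# Freebase's mqlread API pages its results: each response carries a 'cursor'
# value that is passed back to fetch the next page and becomes False once the
# result set is exhausted. Assuming the default page size of 100 results, the
# initial call plus MAX_ITERATIONS (10) follow-up calls yield the 1100 records
# mentioned in the module docstring.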
if __name__ == "__main__":
    filmlist = []
    cursor = do_query(filmlist)
    i = 0
    while cursor:
        cursor = do_query(filmlist, cursor)
        i += 1
        if i == MAX_ITERATIONS:
            break
    gen_json(filmlist)
    gen_csv(filmlist)
    gen_xml(filmlist)
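# A quick sanity check after a run (hypothetical command, not part of the script):
#   python3 -c "import json; print(len(json.load(open('films.json'))))"
# should print roughly 1100 once all pages have been fetched.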