# This file is part of EbookLib.
# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
#
# EbookLib is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# EbookLib is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with EbookLib. If not, see <http://www.gnu.org/licenses/>.
import zipfile
import six
import logging
import uuid
import warnings
import posixpath as zip_path
import os.path
from collections import OrderedDict
try:
from urllib.parse import unquote
except ImportError:
from urllib import unquote
from lxml import etree
import ebooklib
from ebooklib.utils import parse_string, parse_html_string, guess_type, get_pages_for_items
# Version of EPUB library, kept as a tuple of integers. EpubBook.reset() joins
# the parts with '.' to build the 'Ebook-lib <version>' generator metadata.
VERSION = (0, 17, 1)
# Short names mapped to the XML namespace URIs they stand for.
# EpubBook.add_metadata() accepts these short names and replaces them with the
# full URI; EpubHtml.get_content() uses the 'XML' entry to set xml:lang.
NAMESPACES = {'XML': 'http://www.w3.org/XML/1998/namespace',
'EPUB': 'http://www.idpf.org/2007/ops',
'DAISY': 'http://www.daisy.org/z3986/2005/ncx/',
'OPF': 'http://www.idpf.org/2007/opf',
'CONTAINERNS': 'urn:oasis:names:tc:opendocument:xmlns:container',
'DC': 'http://purl.org/dc/elements/1.1/',
'XHTML': 'http://www.w3.org/1999/xhtml'}
# XML Templates
# Path of the container document (presumably relative to the root of the EPUB
# archive -- the code that uses it is not visible in this part of the file).
CONTAINER_PATH = 'META-INF/container.xml'
# Container document template. %(folder_name)s is a %-formatting placeholder;
# presumably filled with the book's FOLDER_NAME -- confirm against the writer.
CONTAINER_XML = '''<?xml version="1.0" encoding="utf-8"?>
<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container" version="1.0">
<rootfiles>
<rootfile media-type="application/oebps-package+xml" full-path="%(folder_name)s/content.opf"/>
</rootfiles>
</container>
'''
# The four templates below are byte strings (six.b) and are registered by
# EpubBook.reset() under the keys 'ncx', 'nav', 'chapter' and 'cover'.
# Skeleton of the NCX document.
NCX_XML = six.b('''<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" />''')
# Skeleton of the navigation document.
NAV_XML = six.b('''<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops"/>''')
# Empty <html> root that EpubHtml.get_content() fills with <head> and <body>.
CHAPTER_XML = six.b('''<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" epub:prefix="z3998: http://www.daisy.org/z3998/2012/vocab/structure/#"></html>''')
# Cover page document. EpubCoverHtml.get_content() uses it as the page content
# and then sets the 'src' and 'alt' attributes of the <img> element.
COVER_XML = six.b('''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" lang="en" xml:lang="en">
<head>
<style>
body { margin: 0em; padding: 0em; }
img { max-width: 100%; max-height: 100%; }
</style>
</head>
<body>
<img src="" alt="" />
</body>
</html>''')
# Media types regarded as images (the code that consults this list is not
# visible in this part of the file).
IMAGE_MEDIA_TYPES = ['image/jpeg', 'image/jpg', 'image/png', 'image/svg+xml']
# TOC and navigation elements
class Section(object):
    """
    Titled navigation entry with an optional link target.

    :Args:
      - title: Title of the section
      - href: Target of the section (optional, empty string by default)
    """

    def __init__(self, title, href=''):
        self.title, self.href = title, href
class Link(object):
    """
    Navigation entry pointing at a target.

    :Args:
      - href: Target of the link
      - title: Title of the link
      - uid: Identifier of the link (optional, None by default)
    """

    def __init__(self, href, title, uid=None):
        self.href, self.title, self.uid = href, title, uid
# Exceptions
class EpubException(Exception):
    """
    Exception type carrying an error code together with a message.

    :Args:
      - code: Error code, available afterwards as ``code``
      - msg: Error message, available afterwards as ``msg``
    """

    def __init__(self, code, msg):
        # Hand both values to Exception so that ``args`` is populated. Without
        # this, repr() shows an empty argument list and copying or pickling
        # the exception fails because it cannot be re-created from ``args``.
        super(EpubException, self).__init__(code, msg)
        self.code = code
        self.msg = msg

    def __str__(self):
        """
        :Returns:
          The repr() of the message (the code is not included).
        """
        return repr(self.msg)
# Items
class EpubItem(object):
    """
    Common base class for everything that is stored in a book.
    """

    def __init__(self, uid=None, file_name='', media_type='', content=six.b(''), manifest=True):
        """
        :Args:
          - uid: Unique identifier of the item (optional)
          - file_name: File name of the item (optional)
          - media_type: Media type of the item (optional)
          - content: Content of the item (optional)
          - manifest: Flag stored on the item (optional); presumably tells whether
            the item is listed in the package manifest -- confirm against the writer
        """
        # values supplied by the caller
        self.id = uid
        self.file_name = file_name
        self.media_type = media_type
        self.content = content
        self.manifest = manifest

        # fixed defaults for a freshly created item
        self.is_linear = True
        self.book = None

    def get_id(self):
        """
        Gives back the unique identifier of the item.

        :Returns:
          The identifier exactly as it was passed in as uid.
        """
        return self.id

    def get_name(self):
        """
        Gives back the name of the item. This is the file name by default, but
        subclasses are free to return something else.

        :Returns:
          File name of the item.
        """
        return self.file_name

    def get_type(self):
        """
        Determines the type of the item from the extension of its name. The
        extension is lower-cased and looked up in ebooklib.EXTENSIONS; the
        first type whose extension list contains it wins.

        Known types:
          - ITEM_UNKNOWN = 0
          - ITEM_IMAGE = 1
          - ITEM_STYLE = 2
          - ITEM_SCRIPT = 3
          - ITEM_NAVIGATION = 4
          - ITEM_VECTOR = 5
          - ITEM_FONT = 6
          - ITEM_VIDEO = 7
          - ITEM_AUDIO = 8
          - ITEM_DOCUMENT = 9
          - ITEM_COVER = 10

        :Returns:
          Type of the item as number, ebooklib.ITEM_UNKNOWN when nothing matches.
        """
        extension = zip_path.splitext(self.get_name())[1].lower()

        matching_types = (
            item_type
            for item_type, extensions in six.iteritems(ebooklib.EXTENSIONS)
            if extension in extensions
        )

        return next(matching_types, ebooklib.ITEM_UNKNOWN)

    def get_content(self, default=six.b('')):
        """
        Gives back the content of the item. Content should be of type 'str'
        (Python 2) or 'bytes' (Python 3).

        :Args:
          - default: Value handed back when the item has no (or empty) content.

        :Returns:
          Content of the item, or the default.
        """
        content = self.content

        return content if content else default

    def set_content(self, content):
        """
        Replaces the content of this item.

        :Args:
          - content: New content value
        """
        self.content = content

    def __str__(self):
        return '<EpubItem:%s>' % self.id
class EpubNcx(EpubItem):
    """
    Represents Navigation Control File (NCX) in the EPUB.
    """

    def __init__(self, uid='ncx', file_name='toc.ncx'):
        """
        :Args:
          - uid: Unique identifier of the item (optional, 'ncx' by default)
          - file_name: File name of the item (optional, 'toc.ncx' by default)
        """
        ncx_media_type = 'application/x-dtbncx+xml'

        super(EpubNcx, self).__init__(media_type=ncx_media_type, file_name=file_name, uid=uid)

    def __str__(self):
        return '<EpubNcx:%s>' % self.id
class EpubCover(EpubItem):
    """
    Item holding the cover image of the EPUB file.
    """

    def __init__(self, uid='cover-img', file_name=''):
        """
        :Args:
          - uid: Unique identifier of the item (optional, 'cover-img' by default)
          - file_name: File name of the cover image (optional)
        """
        super(EpubCover, self).__init__(file_name=file_name, uid=uid)

    def get_type(self):
        """
        :Returns:
          Always ebooklib.ITEM_COVER, whatever the file extension is.
        """
        return ebooklib.ITEM_COVER

    def __str__(self):
        description = (self.id, self.file_name)

        return '<EpubCover:%s:%s>' % description
class EpubHtml(EpubItem):
    """
    Represents HTML document in the EPUB file.
    """
    # key of the book template that get_content() uses as document skeleton
    _template_name = 'chapter'

    def __init__(self, uid=None, file_name='', media_type='', content=None, title='',
                 lang=None, direction=None, media_overlay=None, media_duration=None):
        """
        :Args:
          - uid: Unique identifier for this document (optional)
          - file_name: File name for this document (optional)
          - media_type: Media type for this document (optional)
          - content: HTML content of this document (optional)
          - title: Title emitted in the <head> of the document (optional)
          - lang: Language code; the language of the book is used when not set (optional)
          - direction: Value for the 'dir' attribute of <html> and <body> (optional)
          - media_overlay: Stored on the item as is (optional)
          - media_duration: Stored on the item as is (optional)
        """
        super(EpubHtml, self).__init__(uid, file_name, media_type, content)

        self.title = title
        self.lang = lang
        self.direction = direction

        self.media_overlay = media_overlay
        self.media_duration = media_duration

        self.links = []
        self.properties = []
        self.pages = []

    def is_chapter(self):
        """
        Returns if this document is chapter or not.

        :Returns:
          Returns bool value, always True for this class.
        """
        return True

    def get_type(self):
        """
        Always returns ebooklib.ITEM_DOCUMENT as type of this document.

        :Returns:
          Always returns ebooklib.ITEM_DOCUMENT
        """
        return ebooklib.ITEM_DOCUMENT

    def set_language(self, lang):
        """
        Sets language for this book item. By default it will use language of the book but it
        can be overwritten with this call.

        :Args:
          - lang: Language code
        """
        self.lang = lang

    def get_language(self):
        """
        Get language code for this book item. Language of the book item can be different from
        the language settings defined globally for book.

        :Returns:
          Language code as set on this item; None when it was never set.
        """
        return self.lang

    def add_link(self, **kwgs):
        """
        Add additional link to the document. Links will be embedded only inside of this document.
        The keyword arguments become the attributes of the generated element. Adding a link of
        type 'text/javascript' also marks the document with the 'scripted' property.

        >>> add_link(href='styles.css', rel='stylesheet', type='text/css')
        """
        self.links.append(kwgs)

        if kwgs.get('type') == 'text/javascript':
            if 'scripted' not in self.properties:
                self.properties.append('scripted')

    def get_links(self):
        """
        Returns additional links defined for this document.

        :Returns:
          Generator yielding the links (dictionaries of attributes) in the order they were added.
        """
        return (link for link in self.links)

    def get_links_of_type(self, link_type):
        """
        Returns additional links of specific type.

        :Args:
          - link_type: Value the 'type' attribute of a link has to be equal to

        :Returns:
          Generator yielding the matching links in the order they were added.
        """
        return (link for link in self.links if link.get('type', '') == link_type)

    def add_item(self, item):
        """
        Add other item to this document. It will create additional links according to the item type:
        a stylesheet link for ITEM_STYLE and a script link for ITEM_SCRIPT. Items of any other
        type are ignored.

        :Args:
          - item: item we want to add defined as instance of EpubItem
        """
        if item.get_type() == ebooklib.ITEM_STYLE:
            self.add_link(href=item.get_name(), rel='stylesheet', type='text/css')

        if item.get_type() == ebooklib.ITEM_SCRIPT:
            self.add_link(src=item.get_name(), type='text/javascript')

    def get_body_content(self):
        """
        Returns content of BODY element for this HTML document. Content will be of type 'str' (Python 2)
        or 'bytes' (Python 3).

        :Returns:
          Serialized children of the BODY element. An empty 'str' is returned when the content
          cannot be parsed, when there is no BODY element or when BODY has no child elements.
        """
        try:
            html_tree = parse_html_string(self.content)
        except Exception:
            # best effort: unparsable (or missing) content yields no body;
            # KeyboardInterrupt and SystemExit are no longer swallowed here
            return ''

        root_body = html_tree.getroottree().find('body')

        # find() gives None when there is no <body>; calling len() on that
        # would raise TypeError, so check for it explicitly
        if root_body is None or len(root_body) == 0:
            return ''

        body = html_tree.find('body')
        tree_str = etree.tostring(body, pretty_print=True, encoding='utf-8', xml_declaration=False)

        # strip the enclosing <body>...</body>; this is only done for a body
        # tag without attributes, otherwise the element is returned whole
        if tree_str.startswith(six.b('<body>')):
            n = tree_str.rindex(six.b('</body>'))

            return tree_str[6:n]

        return tree_str

    def get_content(self, default=None):
        """
        Returns content for this document as HTML string. Content will be of type 'str' (Python 2)
        or 'bytes' (Python 3).

        The document is built from the template of the book: language and direction are set,
        a HEAD with title and links is created and the children of the BODY element of the
        content are moved into a new BODY. The item has to be attached to a book (self.book)
        for this to work.

        :Args:
          - default: Not used by this implementation; accepted so the signature matches
            EpubItem.get_content.

        :Returns:
          Returns content of this document, an empty 'str' when the content cannot be parsed.
        """
        tree = parse_string(self.book.get_template(self._template_name))
        tree_root = tree.getroot()

        language = self.lang or self.book.language
        tree_root.set('lang', language)
        tree_root.attrib['{%s}lang' % NAMESPACES['XML']] = language

        try:
            html_tree = parse_html_string(self.content)
        except Exception:
            # best effort: unparsable (or missing) content yields no document;
            # KeyboardInterrupt and SystemExit are no longer swallowed here
            return ''

        # create and populate head
        _head = etree.SubElement(tree_root, 'head')

        if self.title != '':
            _title = etree.SubElement(_head, 'title')
            _title.text = self.title

        for lnk in self.links:
            if lnk.get('type') == 'text/javascript':
                _lnk = etree.SubElement(_head, 'script', lnk)
                # force <script></script>
                _lnk.text = ''
            else:
                _lnk = etree.SubElement(_head, 'link', lnk)

        # NOTE: elements from the HEAD of the content are not carried over;
        # only the title and the links added above end up in the document.

        # create and populate body
        _body = etree.SubElement(tree_root, 'body')

        if self.direction:
            _body.set('dir', self.direction)
            tree_root.set('dir', self.direction)

        body = html_tree.find('body')

        if body is not None:
            # iterate over a copy, append() moves each child out of 'body'
            for child in list(body):
                _body.append(child)

        tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

        return tree_str

    def __str__(self):
        return '<EpubHtml:%s:%s>' % (self.id, self.file_name)
class EpubCoverHtml(EpubHtml):
    """
    Represents Cover page in the EPUB file.
    """

    def __init__(self, uid='cover', file_name='cover.xhtml', image_name='', title='Cover'):
        """
        :Args:
          - uid: Unique identifier for this page (optional, 'cover' by default)
          - file_name: File name for this page (optional, 'cover.xhtml' by default)
          - image_name: Value used as 'src' of the image on the page (optional)
          - title: Title of the page, also used as 'alt' of the image (optional)
        """
        super(EpubCoverHtml, self).__init__(uid=uid, file_name=file_name, title=title)

        self.image_name = image_name
        self.is_linear = False

    def is_chapter(self):
        """
        Returns if this document is chapter or not.

        :Returns:
          Returns bool value, always False for the cover page.
        """
        return False

    def get_content(self, default=None):
        """
        Returns content for cover page as HTML string. Content will be of type 'str' (Python 2) or 'bytes' (Python 3).

        The content of the item is replaced with the 'cover' template of the book, the document
        is rendered by EpubHtml.get_content and the first image in it is pointed at the cover image.

        :Args:
          - default: Not used; accepted so that the signature matches EpubHtml.get_content and
            the page can be handled like any other item.

        :Returns:
          Returns content of this document.
        """
        self.content = self.book.get_template('cover')

        tree = parse_string(super(EpubCoverHtml, self).get_content(default))
        tree_root = tree.getroot()

        # the template is expected to contain at least one <img> element
        images = tree_root.xpath('//xhtml:img', namespaces={'xhtml': NAMESPACES['XHTML']})

        images[0].set('src', self.image_name)
        images[0].set('alt', self.title)

        tree_str = etree.tostring(tree, pretty_print=True, encoding='utf-8', xml_declaration=True)

        return tree_str

    def __str__(self):
        return '<EpubCoverHtml:%s:%s>' % (self.id, self.file_name)
class EpubNav(EpubHtml):
    """
    Item holding the Navigation Document of the EPUB file.
    """

    def __init__(self, uid='nav', file_name='nav.xhtml', media_type='application/xhtml+xml', title=''):
        """
        :Args:
          - uid: Unique identifier of the item (optional, 'nav' by default)
          - file_name: File name of the item (optional, 'nav.xhtml' by default)
          - media_type: Media type of the item (optional, 'application/xhtml+xml' by default)
          - title: Title of the document (optional)
        """
        settings = dict(uid=uid, file_name=file_name, media_type=media_type, title=title)

        super(EpubNav, self).__init__(**settings)

    def is_chapter(self):
        """
        Tells whether this document is a chapter.

        :Returns:
          Bool value, always False for the navigation document.
        """
        return False

    def __str__(self):
        description = (self.id, self.file_name)

        return '<EpubNav:%s:%s>' % description
class EpubImage(EpubItem):
    """
    Item holding an image of the EPUB file.
    """

    def __init__(self, *args, **kwargs):
        """
        Takes the same arguments as EpubItem and passes them on unchanged.
        """
        super(EpubImage, self).__init__(*args, **kwargs)

    def get_type(self):
        """
        :Returns:
          Always ebooklib.ITEM_IMAGE, whatever the file extension is.
        """
        return ebooklib.ITEM_IMAGE

    def __str__(self):
        description = (self.id, self.file_name)

        return '<EpubImage:%s:%s>' % description
class EpubSMIL(EpubItem):
    """
    Item with the media type 'application/smil+xml'.
    """

    def __init__(self, uid=None, file_name='', content=None):
        """
        :Args:
          - uid: Unique identifier of the item (optional)
          - file_name: File name of the item (optional)
          - content: Content of the item (optional)
        """
        smil_media_type = 'application/smil+xml'

        super(EpubSMIL, self).__init__(uid=uid, file_name=file_name,
                                       media_type=smil_media_type, content=content)

    def get_type(self):
        """
        :Returns:
          Always ebooklib.ITEM_SMIL, whatever the file extension is.
        """
        return ebooklib.ITEM_SMIL

    def __str__(self):
        description = (self.id, self.file_name)

        return '<EpubSMIL:%s:%s>' % description
# EpubBook
class EpubBook(object):
def __init__(self):
    """
    Creates an empty book: EPUB_VERSION starts out as None and every other
    attribute gets its default value from reset().
    """
    self.EPUB_VERSION = None

    # everything else is initialised in one place so it can be redone later
    self.reset()

    # we should have options here
def reset(self):
    """
    Puts every attribute of the book back to its default value.

    Besides emptying the containers this registers the default templates,
    records the generator metadata and assigns a fresh random identifier.
    """
    # containers describing the book
    self.metadata = {}
    self.items = []
    self.spine = []
    self.guide = []
    self.pages = []
    self.toc = []
    self.bindings = []

    self.IDENTIFIER_ID = 'id'
    self.FOLDER_NAME = 'EPUB'

    # counters, all starting at zero
    self._id_html = self._id_image = self._id_static = 0

    self.title = ''
    self.language = 'en'
    self.direction = None

    self.templates = dict(ncx=NCX_XML, nav=NAV_XML, chapter=CHAPTER_XML, cover=COVER_XML)

    # e.g. (0, 17, 1) becomes 'Ebook-lib 0.17.1'
    version_string = '.'.join(str(part) for part in VERSION)
    generator_attributes = {'name': 'generator', 'content': 'Ebook-lib %s' % version_string}
    self.add_metadata('OPF', 'generator', '', generator_attributes)

    # a random identifier is used until one is specified manually
    random_identifier = str(uuid.uuid4())
    self.set_identifier(random_identifier)

    # custom prefixes and namespaces to be set to the content.opf doc
    self.prefixes = []
    self.namespaces = {}
def set_identifier(self, uid):
    """
    Stores the unique id of this epub and records it as DC identifier
    metadata carrying the id attribute IDENTIFIER_ID.

    :Args:
      - uid: Value of unique identifier for this book
    """
    self.uid = uid

    identifier_attributes = {'id': self.IDENTIFIER_ID}
    self.set_unique_metadata('DC', 'identifier', self.uid, identifier_attributes)
def set_title(self, title):
    """
    Stores the title of the book and records it as DC title metadata.
    Every call adds one more title entry to the metadata, so a book can
    carry several titles; the ``title`` attribute holds the latest one.

    :Args:
      - title: Title value
    """
    self.title = title

    namespace, name = 'DC', 'title'
    self.add_metadata(namespace, name, self.title)
def set_language(self, lang):
    """
    Stores the language of the book and records it as DC language metadata.
    Every call adds one more language entry to the metadata; the ``language``
    attribute holds the latest one. Specific items in the book can have
    different language settings.

    :Args:
      - lang: Language code
    """
    self.language = lang

    namespace, name = 'DC', 'language'
    self.add_metadata(namespace, name, lang)
def set_direction(self, direction):
"""
:Args:
- direction: Options are "ltr", "rtl" and "default"
"""
self.direction = direction
def set_cover(self, file_name, content, create_page=True):
    """
    Adds the cover image to the book, optionally together with a cover
    page, and records the 'cover' meta entry.

    :Args:
      - file_name: File name of the cover image
      - content: Content for the cover image
      - create_page: Should cover page be defined. Defined as bool value (optional). Default value is True.
    """
    # as it is now, it can only be called once
    cover_image = EpubCover(file_name=file_name)
    cover_image.content = content
    self.add_item(cover_image)

    if create_page:
        cover_page = EpubCoverHtml(image_name=file_name)
        self.add_item(cover_page)

    # 'cover-img' is the default uid of EpubCover
    cover_attributes = OrderedDict([('name', 'cover'), ('content', 'cover-img')])
    self.add_metadata(None, 'meta', '', cover_attributes)
def add_author(self, author, file_as=None, role=None, uid='creator'):
    """
    Records an author as DC creator metadata. For file_as and role, when
    given, an additional meta entry refining the creator is recorded.

    :Args:
      - author: Name of the author
      - file_as: Value for the 'file-as' refinement (optional)
      - role: Value for the 'role' refinement (optional)
      - uid: Id of the creator entry the refinements point at (optional, 'creator' by default)
    """
    self.add_metadata('DC', 'creator', author, {'id': uid})

    # same order as the entries are expected to be written: file-as, then role
    refinements = (('file-as', file_as), ('role', role))

    for property_name, refinement in refinements:
        if not refinement:
            continue

        self.add_metadata(None, 'meta', refinement, {'refines': '#' + uid,
                                                     'property': property_name,
                                                     'scheme': 'marc:relators'})
def add_metadata(self, namespace, name, value, others=None):
    """
    Appends one metadata entry to the book.

    Entries are kept per namespace and per name as a list of
    (value, others) tuples, in the order they were added.

    :Args:
      - namespace: Short name found in NAMESPACES (replaced with the full URI) or
        any other value, which is used as it is
      - name: Name of the metadata entry
      - value: Value of the metadata entry
      - others: Stored next to the value as it is (optional)
    """
    # short names such as 'DC' are replaced, everything else is kept
    namespace = NAMESPACES.get(namespace, namespace)

    entries = self.metadata.setdefault(namespace, {}).setdefault(name, [])
    entries.append((value, others))
def get_metadata(self, namespace, name):
"Retrieve metadata"
...
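# Source: https://github.com/aerkalov/ebooklib/blob/master/ebooklib/epub.py
# (Q&A-site notice appended to the listing, translated:) Hello, I am the Q&A assistant. We are very sorry: the team of technical experts did not answer the question you submitted within the time limit.
# The Q&A credit deducted for this question has been returned to your account. We will keep improving and widening the scope of our service in order to serve you better.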