#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This script generates a data file containing all Unicode information needed by KCharSelect.
#
#############################################################################
# SPDX-FileCopyrightText: 2007 Daniel Laidig <d.laidig@gmx.de>
#
# SPDX-License-Identifier: LGPL-2.1-or-later
#############################################################################
#
# The current directory must contain the following files that can be found at
# http://www.unicode.org/Public/UNIDATA/:
# - UnicodeData.txt
# - Unihan_Readings.txt (you need to uncompress it from Unihan.zip)
# - NamesList.txt
# - Blocks.txt
#
# The generated file is named "kcharselect-data" and has to be put in kdelibs/kdeui/widgets/.
# Additionally, a translation dummy named "kcharselect-translation.cpp" is generated and has
# to be placed in the same directory.
#
# FILE STRUCTURE
#
# The generated file is a binary file. The first 40 bytes are the header
# and contain the position of each part of the file. Each entry is uint32.
#
# pos content
# 0   names strings begin
# 4   names offsets begin
# 8   details strings begin
# 12  details offsets begin
# 16  block strings begin
# 20  block offsets begin
# 24  section strings begin
# 28  section offsets begin
# 32  unihan strings begin
# 36  unihan offsets begin
#
# The string parts always contain all strings in a row, each followed by a 0x00 byte.
# There is one exception: the data for seeAlso in details is only 4 bytes (as it is always _one_
# unicode character) and _not_ followed by a 0x00 byte.
#
# The offset parts contain entries with a fixed length. Unicode characters and offsets are always uint32.
# Offsets are positions in the data file.
#
# names_offsets:
# each entry 8 bytes
# 32bit: unicode
# 32bit: offset to name in names_strings
#
# names_strings:
# the first byte is the category (same values as QChar::Category),
# directly followed by the character name (terminated by 0x00)
#
# nameslist_offsets:
# char, alias, alias_count, note, note_count, approxEquiv, approxEquiv_count, equiv, equiv_count, seeAlso, seeAlso_count
# 32    32     8            32    8           32           8                  32     8            32       8
# => each entry 29 bytes
#
# blocks_offsets:
# each entry 8 bytes
# 32bit: start unicode
# 32bit: end unicode
# Note that there is no string offset.
#
# section_offsets:
# each entry 8 bytes
# 32bit: section index
# 32bit: block index
# Note that these values are _not_ positions in the data file but indexes.
# For example the pair (4, 3) means the fourth section includes the third block.
#
# unihan_offsets:
# each entry 32 bytes
# 32bit: unicode
# 32bit: offset to unihan_strings for Definition
# 32bit: offset to unihan_strings for Cantonese
# 32bit: offset to unihan_strings for Mandarin
# 32bit: offset to unihan_strings for Tang
# 32bit: offset to unihan_strings for Korean
# 32bit: offset to unihan_strings for JapaneseKun
# 32bit: offset to unihan_strings for JapaneseOn

from struct import *
import sys
import re
import io

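# Illustrative sketch (not used by this script): how a consumer could read the ten
# uint32 header fields described at the top back out of the generated file. The actual
# reader lives on the C++ side of KCharSelect; this helper only documents the layout,
# and the "kcharselect-data" default is simply the file name written further below.
def _read_header_example(path="kcharselect-data"):
    from struct import unpack
    labels = ("names strings", "names offsets", "details strings", "details offsets",
              "block strings", "block offsets", "section strings", "section offsets",
              "unihan strings", "unihan offsets")
    with open(path, "rb") as f:
        positions = unpack("=10I", f.read(40))  # 10 uint32 values = the 40-byte header
    return dict(zip(labels, positions))
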
# based on http://www.unicode.org/charts/
sectiondata = '''
SECTION European Scripts
Armenian
Carian
Caucasian Albanian
Cypriot Syllabary
Cypro-Minoan
Cyrillic
Cyrillic Supplement
Cyrillic Extended-A
Cyrillic Extended-B
Cyrillic Extended-C
Cyrillic Extended-D
Elbasan
Georgian
Georgian Extended
Georgian Supplement
Glagolitic
Glagolitic Supplement
Gothic
Greek and Coptic
Greek Extended
Ancient Greek Numbers
Basic Latin
Latin-1 Supplement
Latin Extended-A
Latin Extended-B
Latin Extended-C
Latin Extended-D
Latin Extended-E
Latin Extended-F
Latin Extended-G
Latin Extended Additional
IPA Extensions
Phonetic Extensions
Phonetic Extensions Supplement
Linear A
Linear B Syllabary
Linear B Ideograms
Aegean Numbers
Lycian
Lydian
Ogham
Old Hungarian
Old Italic
Old Permic
Phaistos Disc
Runic
Shavian
Vithkuqi

SECTION Modifier Letters
Modifier Tone Letters
Spacing Modifier Letters
Superscripts and Subscripts

SECTION Combining Marks
Combining Diacritical Marks
Combining Diacritical Marks Extended
Combining Diacritical Marks Supplement
Combining Diacritical Marks for Symbols
Combining Half Marks

SECTION African Scripts
Adlam
Bamum
Bamum Supplement
Bassa Vah
Coptic
Coptic Epact Numbers
Egyptian Hieroglyphs
Egyptian Hieroglyph Format Controls
Ethiopic
Ethiopic Supplement
Ethiopic Extended
Ethiopic Extended-A
Ethiopic Extended-B
Medefaidrin
Mende Kikakui
Meroitic Cursive
Meroitic Hieroglyphs
NKo
Osmanya
Tifinagh
Vai

SECTION Middle Eastern Scripts
Anatolian Hieroglyphs
Arabic
Arabic Supplement
Arabic Extended-A
Arabic Extended-B
Arabic Extended-C
Arabic Presentation Forms-A
Arabic Presentation Forms-B
Imperial Aramaic
Avestan
Chorasmian
Carian
Cuneiform
Cuneiform Numbers and Punctuation
Early Dynastic Cuneiform
Old Persian
Ugaritic
Elymaic
Hatran
Hebrew
Mandaic
Nabataean
Old North Arabian
Old South Arabian
Inscriptional Pahlavi
Psalter Pahlavi
Palmyrene
Inscriptional Parthian
Phoenician
Samaritan
Syriac
Syriac Supplement
Yezidi

SECTION Central Asian Scripts
Manichaean
Marchen
Mongolian
Mongolian Supplement
Old Sogdian
Old Turkic
Old Uyghur
Phags-pa
Sogdian
Soyombo
Tibetan
Zanabazar Square

SECTION South Asian Scripts
Ahom
Bengali
Bhaiksuki
Brahmi
Chakma
Devanagari
Devanagari Extended
Devanagari Extended-A
Dives Akuru
Dogra
Grantha
Gujarati
Gunjala Gondi
Gurmukhi
Kaithi
Kannada
Kharoshthi
Khojki
Khudawadi
Lepcha
Limbu
Mahajani
Malayalam
Masaram Gondi
Meetei Mayek
Meetei Mayek Extensions
Modi
Mro
Multani
Nag Mundari
Nandinagari
Newa
Ol Chiki
Oriya
Saurashtra
Sharada
Siddham
Sinhala
Sinhala Archaic Numbers
Sora Sompeng
Syloti Nagri
Takri
Tamil
Tamil Supplement
Telugu
Thaana
Tirhuta
Toto
Vedic Extensions
Wancho
Warang Citi

SECTION Southeast Asian Scripts
Cham
Hanifi Rohingya
Kayah Li
Khmer
Khmer Symbols
Lao
Myanmar
Myanmar Extended-A
Myanmar Extended-B
New Tai Lue
Nyiakeng Puachue Hmong
Pahawh Hmong
Pau Cin Hau
Tai Le
Tai Tham
Tai Viet
Tangsa
Thai

SECTION Indonesia & Oceania Scripts
Balinese
Batak
Buginese
Buhid
Hanunoo
Javanese
Kawi
Makasar
Rejang
Sundanese
Sundanese Supplement
Tagalog
Tagbanwa

SECTION East Asian Scripts
Bopomofo
Bopomofo Extended
CJK Unified Ideographs
CJK Unified Ideographs Extension A
CJK Unified Ideographs Extension B
CJK Unified Ideographs Extension C
CJK Unified Ideographs Extension D
CJK Unified Ideographs Extension E
CJK Unified Ideographs Extension F
CJK Unified Ideographs Extension G
CJK Unified Ideographs Extension H
CJK Compatibility Ideographs
CJK Compatibility Ideographs Supplement
Kangxi Radicals
CJK Radicals Supplement
CJK Strokes
Ideographic Description Characters
Hangul Jamo
Hangul Jamo Extended-A
Hangul Jamo Extended-B
Hangul Compatibility Jamo
Hangul Syllables
Hiragana
Kana Extended-A
Kana Extended-B
Kana Supplement
Small Kana Extension
Kanbun
Katakana
Katakana Phonetic Extensions
Khitan Small Script
Lisu
Lisu Supplement
Miao
Nushu
Tangut
Tangut Components
Tangut Supplement
Yi Syllables
Yi Radicals

SECTION American Scripts
Cherokee
Cherokee Supplement
Deseret
Osage
Unified Canadian Aboriginal Syllabics
Unified Canadian Aboriginal Syllabics Extended
Unified Canadian Aboriginal Syllabics Extended-A

SECTION Other
Alphabetic Presentation Forms
Halfwidth and Fullwidth Forms

SECTION Notational Systems
Braille Patterns
Musical Symbols
Ancient Greek Musical Notation
Byzantine Musical Symbols
Znamenny Musical Notation
Duployan
Shorthand Format Controls
Sutton SignWriting

SECTION Punctuation
General Punctuation
Supplemental Punctuation
CJK Symbols and Punctuation
Ideographic Symbols and Punctuation
CJK Compatibility Forms
Halfwidth and Fullwidth Forms
Small Form Variants
Vertical Forms

SECTION Alphanumeric Symbols
Letterlike Symbols
Mathematical Alphanumeric Symbols
Arabic Mathematical Alphabetic Symbols
Enclosed Alphanumerics
Enclosed Alphanumeric Supplement
Enclosed CJK Letters and Months
Enclosed Ideographic Supplement
CJK Compatibility

SECTION Technical Symbols
Control Pictures
Miscellaneous Technical
Optical Character Recognition

SECTION Numbers & Digits
Common Indic Number Forms
Coptic Epact Numbers
Counting Rod Numerals
Cuneiform Numbers and Punctuation
Indic Siyaq Numbers
Kaktovik Numerals
Mayan Numerals
Number Forms
Ottoman Siyaq Numbers
Rumi Numeral Symbols
Sinhala Archaic Numbers

SECTION Mathematical Symbols
Arrows
Supplemental Arrows-A
Supplemental Arrows-B
Supplemental Arrows-C
Miscellaneous Symbols and Arrows
Mathematical Alphanumeric Symbols
Arabic Mathematical Alphabetic Symbols
Letterlike Symbols
Mathematical Operators
Supplemental Mathematical Operators
Miscellaneous Mathematical Symbols-A
Miscellaneous Mathematical Symbols-B
Geometric Shapes
Box Drawing
Block Elements
Geometric Shapes Extended

SECTION Emoji & Pictographs
Dingbats
Ornamental Dingbats
Emoticons
Miscellaneous Symbols
Miscellaneous Symbols and Pictographs
Supplemental Symbols and Pictographs
Symbols and Pictographs Extended-A
Transport and Map Symbols

SECTION Other Symbols
Alchemical Symbols
Ancient Symbols
Currency Symbols
Chess Symbols
Domino Tiles
Mahjong Tiles
Playing Cards
Miscellaneous Symbols and Arrows
Symbols for Legacy Computing
Yijing Hexagram Symbols
Tai Xuan Jing Symbols

SECTION Specials
Specials
Tags
Variation Selectors
Variation Selectors Supplement

SECTION Private Use
Private Use Area
Supplementary Private Use Area-A
Supplementary Private Use Area-B

SECTION Surrogates
High Surrogates
High Private Use Surrogates
Low Surrogates
'''
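# parseSections() below turns the literal above into (section, block) pairs, e.g.
# ("European Scripts", "Armenian"), ("European Scripts", "Carian"), ...; every block
# read from Blocks.txt must appear under some SECTION here, otherwise
# writeBlockStrings() aborts.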
# TODO: rename "Other Scripts" to "American Scripts"

categoryMap = { # same values as QChar::Category
    "Mn": 1,
    "Mc": 2,
    "Me": 3,
    "Nd": 4,
    "Nl": 5,
    "No": 6,
    "Zs": 7,
    "Zl": 8,
    "Zp": 9,
    "Cc": 10,
    "Cf": 11,
    "Cs": 12,
    "Co": 13,
    "Cn": 14,
    "Lu": 15,
    "Ll": 16,
    "Lt": 17,
    "Lm": 18,
    "Lo": 19,
    "Pc": 20,
    "Pd": 21,
    "Ps": 22,
    "Pe": 23,
    "Pi": 24,
    "Pf": 25,
    "Po": 26,
    "Sm": 27,
    "Sc": 28,
    "Sk": 29,
    "So": 30
}
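# Example: a UnicodeData.txt line such as "0041;LATIN CAPITAL LETTER A;Lu;..." has
# general category "Lu", which maps to 15 (QChar::Letter_Uppercase); that value is
# the category byte written in front of each name in names_strings.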


class Names:
    def __init__(self):
        self.names = []
        self.controlpos = -1
    def addName(self, uni, name, category):
        self.names.append([uni, name, category])

    def calculateStringSize(self):
        # +2 per name: one category byte in front and one 0x00 terminator behind;
        # the "<control>" string is only counted (and later written) once.
        size = 0
        hadcontrol = False
        for entry in self.names:
            if entry[1] == "<control>":
                if not hadcontrol:
                    size += len(entry[1].encode('utf-8')) + 2
                    hadcontrol = True
            else:
                size += len(entry[1].encode('utf-8')) + 2
        return size

    def calculateOffsetSize(self):
        return len(self.names)*8

    def writeStrings(self, out, pos):
        # "<control>" occurs for every control character in UnicodeData.txt; write the
        # string once and let all later control characters reuse its offset.
        hadcontrol = False
        for entry in self.names:
            if entry[1] == "<control>":
                if not hadcontrol:
                    out.write(pack("=b", entry[2]))
                    out.write(entry[1].encode('utf-8') + b"\0")
                    size = len(entry[1].encode('utf-8')) + 2
                    entry[1] = pos
                    self.controlpos = pos
                    pos += size
                    hadcontrol = True
                else:
                    entry[1] = self.controlpos
            else:
                out.write(pack("=b", entry[2]))
                out.write(entry[1].encode('utf-8') + b"\0")
                size = len(entry[1].encode('utf-8')) + 2
                entry[1] = pos
                pos += size
        return pos

    def writeOffsets(self, out, pos):
        for entry in self.names:
            out.write(pack("=II", int(entry[0], 16), entry[1]))
            pos += 8
        return pos

class Details:
    def __init__(self):
        self.details = {}
    def addEntry(self, char, category, text):
        if char not in self.details:
            self.details[char] = {}
        if category not in self.details[char]:
            self.details[char][category] = []
        self.details[char][category].append(text)

    def calculateStringSize(self):
        size = 0
        for char in self.details.values():
            for cat in char.values():
                for s in cat:
                    if type(s) is str:
                        size += len(s.encode('utf-8')) + 1
                    else:
                        size += 4
        return size

    def calculateOffsetSize(self):
        return len(self.details)*29

    def writeStrings(self, out, pos):
        for char in self.details.values():
            for cat in char.values():
                for i in range(0, len(cat)):
                    s = cat[i]
                    if type(s) is str:
                        out.write(s.encode('utf-8') + b"\0")
                        size = len(s.encode('utf-8')) + 1
                    else:
                        out.write(pack("=I", s))
                        size = 4
                    cat[i] = pos
                    pos += size
        return pos

    def writeOffsets(self, out, pos):
        for char in self.details.keys():
            alias = 0
            alias_count = 0
            note = 0
            note_count = 0
            approxEquiv = 0
            approxEquiv_count = 0
            equiv = 0
            equiv_count = 0
            seeAlso = 0
            seeAlso_count = 0
            if "alias" in self.details[char]:
                alias = self.details[char]["alias"][0]
                alias_count = len(self.details[char]["alias"])

            if "note" in self.details[char]:
                note = self.details[char]["note"][0]
                note_count = len(self.details[char]["note"])

            if "approxEquiv" in self.details[char]:
                approxEquiv = self.details[char]["approxEquiv"][0]
                approxEquiv_count = len(self.details[char]["approxEquiv"])

            if "equiv" in self.details[char]:
                equiv = self.details[char]["equiv"][0]
                equiv_count = len(self.details[char]["equiv"])

            if "seeAlso" in self.details[char]:
                seeAlso = self.details[char]["seeAlso"][0]
                seeAlso_count = len(self.details[char]["seeAlso"])

            out.write(pack("=IIbIbIbIbIb", char, alias, alias_count, note, note_count, approxEquiv, approxEquiv_count, equiv, equiv_count, seeAlso, seeAlso_count))
            pos += 29

        return pos

class SectionsBlocks:
    def __init__(self):
        self.sections = []
        self.blocks = []
        self.blockList = []
        self.sectionList = []

    def addBlock(self, begin, end, name):
        self.blocks.append([begin, end, name])
        self.blockList.append(name)

    def addSection(self, section, block):
        self.sections.append([section, block])
        if section not in self.sectionList:
            self.sectionList.append(section)

    def calculateBlockStringSize(self):
        size = 0
        for block in self.blocks:
            size += len(block[2].encode('utf-8')) + 1
        return size

    def calculateBlockOffsetSize(self):
        return len(self.blocks) * 8

    def calculateSectionStringSize(self):
        size = 0
        lastsection = ""
        for section in self.sections:
            if section[0] != lastsection:
                size += len(section[0].encode('utf-8')) + 1
                lastsection = section[0]
        return size

    def calculateSectionOffsetSize(self):
        return len(self.sections) * 8

    def writeBlockStrings(self, out, pos):
        index = 0
        for block in self.blocks:
            out.write(block[2].encode('utf-8') + b"\0")
            size = len(block[2].encode('utf-8')) + 1
            found = False
            for section in self.sections:
                print(section)
                if section[1] == block[2]:
                    print("found", section)
                    section[1] = int(index)
                    found = True
            if not found:
                print("Error: Did not find any category for block \""+block[2]+"\"")
                sys.exit(1)
            block[2] = index
            pos += size
            index += 1
        return pos

    def writeBlockOffsets(self, out, pos):
        for block in self.blocks:
            out.write(pack("=II", int(block[0], 16), int(block[1], 16)))
            pos += 8
        return pos

    def writeSectionStrings(self, out, pos):
        lastsection = ""
        lastpos = 0
        index = -1
        for section in self.sections:
            if section[0] != lastsection:
                index += 1
                lastsection = section[0]
                out.write(section[0].encode('utf-8') + b"\0")
                size = len(section[0].encode('utf-8')) + 1
                section[0] = index
                lastpos = pos
                pos += size
            else:
                section[0] = index
        return pos

    def writeSectionOffsets(self, out, pos):
        print(self.sections)
        for section in self.sections:
            out.write(pack("=II", section[0], section[1]))
            pos += 8
        return pos

    def getBlockList(self):
        return self.blockList

    def getSectionList(self):
        return self.sectionList

class Unihan:
    def __init__(self):
        self.unihan = {}

    def addUnihan(self, uni, category, value):
        uni = int(uni, 16)
        if category not in ("kDefinition", "kCantonese", "kMandarin", "kTang", "kKorean", "kJapaneseKun", "kJapaneseOn"):
            return
        if uni not in self.unihan:
            self.unihan[uni] = [None, None, None, None, None, None, None]
        if category == "kDefinition":
            self.unihan[uni][0] = value
        elif category == "kCantonese":
            self.unihan[uni][1] = value
        elif category == "kMandarin":
            self.unihan[uni][2] = value
        elif category == "kTang":
            self.unihan[uni][3] = value
        elif category == "kKorean":
            self.unihan[uni][4] = value
        elif category == "kJapaneseKun":
            self.unihan[uni][5] = value
        elif category == "kJapaneseOn":
            self.unihan[uni][6] = value

    def calculateStringSize(self):
        size = 0
        for char in self.unihan.keys():
            for entry in self.unihan[char]:
                if entry is not None:
                    size += len(entry.encode('utf-8')) + 1
        return size

    def calculateOffsetSize(self):
        return len(self.unihan) * 32

    def writeStrings(self, out, pos):
        for char in self.unihan.keys():
            for i in range(0, 7):
                if self.unihan[char][i] is not None:
                    out.write(self.unihan[char][i].encode('utf-8') + b"\0")
                    size = len(self.unihan[char][i].encode('utf-8')) + 1
                    self.unihan[char][i] = pos
                    pos += size
        return pos

    def writeOffsets(self, out, pos):
        for char in self.unihan.keys():
            out.write(pack("=I", char))
            for i in range(0, 7):
                if self.unihan[char][i] is not None:
                    out.write(pack("=I", self.unihan[char][i]))
                else:
                    out.write(pack("=I", 0))
            pos += 32
        return pos

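# Unihan_Readings.txt contains one tab-separated line per (code point, field) pair,
# e.g. (illustrative) "U+4E00  kMandarin  yī". Parser.parseUnihan() below matches
# these lines and hands the seven fields of interest to addUnihan() above.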
class Parser:
    def parseUnicodeData(self, inUnicodeData, names):
        regexp = re.compile(r'^([^;]+);([^;]+);([^;]+)')
        for line in inUnicodeData:
            line = line[:-1]
            m = regexp.match(line)
            if not m:
                continue
            uni = m.group(1)
            name = m.group(2)
            category = m.group(3)
            if len(uni) > 8:
                continue
            names.addName(uni, name, categoryMap[category])

    def parseDetails(self, inNamesList, details):
        invalidRegexp = re.compile(r'^@')
        unicodeRegexp = re.compile(r'^([0-9A-F]+)')

        # NamesList.txt annotation lines are indented and start with a marker character:
        # "=" alias, "x" cross reference (see also), "*" note, "#" approximate
        # equivalent, ":" canonical equivalent.
        aliasRegexp = re.compile(r'^\s+=\s+(.+)$') #equal
        seeAlsoRegexp = re.compile(r'^\s+x\s+.*([0-9A-F]{4,6})\)$') #ex
        noteRegexp = re.compile(r'^\s+\*\s+(.+)$') #star
        approxEquivalentRegexp = re.compile(r'^\s+#\s+(.+)$') #pound
        equivalentRegexp = re.compile(r'^\s+:\s+(.+)$') #colon

        drop = 0
        currChar = 0

        for line in inNamesList:
            line = line[:-1]
            m1 = unicodeRegexp.match(line)
            m2 = aliasRegexp.match(line)
            m3 = noteRegexp.match(line)
            m4 = approxEquivalentRegexp.match(line)
            m5 = equivalentRegexp.match(line)
            m6 = seeAlsoRegexp.match(line)
            if invalidRegexp.match(line):
                continue
            elif m1:
                currChar = int(m1.group(1), 16)
                if len(m1.group(1)) > 8: #limit to 32bit
                    drop = 1
                    continue
            elif drop == 1:
                continue
            elif m2:
                value = m2.group(1)
                details.addEntry(currChar, "alias", value)
            elif m3:
                value = m3.group(1)
                details.addEntry(currChar, "note", value)
            elif m4:
                value = m4.group(1)
                details.addEntry(currChar, "approxEquiv", value)
            elif m5:
                value = m5.group(1)
                details.addEntry(currChar, "equiv", value)
            elif m6:
                value = int(m6.group(1), 16)
                details.addEntry(currChar, "seeAlso", value)
    def parseBlocks(self, inBlocks, sectionsBlocks):
        regexp = re.compile(r'^([0-9A-F]+)\.\.([0-9A-F]+); (.+)$')
        for line in inBlocks:
            line = line[:-1]
            m = regexp.match(line)
            if not m:
                continue
            if len(m.group(1)) > 8:
                continue
            sectionsBlocks.addBlock(m.group(1), m.group(2), m.group(3))
    def parseSections(self, inSections, sectionsBlocks):
        currSection = ""
        for line in inSections:
            line = line[:-1]
            if len(line) == 0:
                continue
            temp = line.split(" ")
            if temp[0] == "SECTION":
                currSection = line[8:]
            elif currSection != "" and line != "":
                sectionsBlocks.addSection(currSection, line)
            else:
                print("error in data file")
                sys.exit(1)
    def parseUnihan(self, inUnihan, unihan):
        regexp = re.compile(r'^U\+([0-9A-F]+)\s+([^\s]+)\s+(.+)$')
        count = 0
        for line in inUnihan:
            if count % 100000 == 0:
                print("\b.", end=' ')
                sys.stdout.flush()
            count += 1
            line = line[:-1]
            m = regexp.match(line)
            if not m:
                continue
            if len(m.group(1)) <= 4:
                unihan.addUnihan(m.group(1), m.group(2), m.group(3))

def writeTranslationDummy(out, data):
    out.write(b"""\n\n""")
    for group in data:
        for entry in group[1]:
            out.write(b"I18N_NOOP2(\""+group[0].encode('utf-8')+b"\", \""+entry.encode('utf-8')+b"\");\n")

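# With the data above this emits one translatable dummy per section and block name,
# for example:
#   I18N_NOOP2("KCharSelect section name", "European Scripts");
#   I18N_NOOP2("KCharselect unicode block name", "Basic Latin");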
out = open("kcharselect-data", "wb")
outTranslationDummy = open("kcharselect-translation.cpp", "wb")

inUnicodeData = open("UnicodeData.txt", "r")
inNamesList = open("NamesList.txt", "r")
inBlocks = open("Blocks.txt", "r")
inSections = io.StringIO(sectiondata)
inUnihan = open("Unihan_Readings.txt", "r")

if calcsize('=H') != 2 or calcsize('=I') != 4:
    print("Error: Sizes of ushort and uint are not 16 and 32 bit as expected")
    sys.exit(1)

names = Names()
details = Details()
sectionsBlocks = SectionsBlocks()
unihan = Unihan()

parser = Parser()

print("========== parsing files ===================")
parser.parseUnicodeData(inUnicodeData, names)
print(".", end=' ')
sys.stdout.flush()
parser.parseDetails(inNamesList, details)
print("\b.", end=' ')
sys.stdout.flush()
parser.parseBlocks(inBlocks, sectionsBlocks)
print("\b.", end=' ')
sys.stdout.flush()
parser.parseSections(inSections, sectionsBlocks)
print("\b.", end=' ')
sys.stdout.flush()
parser.parseUnihan(inUnihan, unihan)
print("\b.", end=' ')
sys.stdout.flush()

print("done.")

pos = 0

# write header, size: 40 bytes
print("========== writing header ==================")
out.write(pack("=I", 40))
print("names strings begin", 40)

namesOffsetBegin = names.calculateStringSize() + 40
out.write(pack("=I", namesOffsetBegin))
print("names offsets begin", namesOffsetBegin)

detailsStringBegin = namesOffsetBegin + names.calculateOffsetSize()
out.write(pack("=I", detailsStringBegin))
print("details strings begin", detailsStringBegin)

detailsOffsetBegin = detailsStringBegin + details.calculateStringSize()
out.write(pack("=I", detailsOffsetBegin))
print("details offsets begin", detailsOffsetBegin)

blocksStringBegin = detailsOffsetBegin + details.calculateOffsetSize()
out.write(pack("=I", blocksStringBegin))
print("block strings begin", blocksStringBegin)

blocksOffsetBegin = blocksStringBegin + sectionsBlocks.calculateBlockStringSize()
out.write(pack("=I", blocksOffsetBegin))
print("block offsets begin", blocksOffsetBegin)

sectionStringBegin = blocksOffsetBegin + sectionsBlocks.calculateBlockOffsetSize()
out.write(pack("=I", sectionStringBegin))
print("section strings begin", sectionStringBegin)

sectionOffsetBegin = sectionStringBegin + sectionsBlocks.calculateSectionStringSize()
out.write(pack("=I", sectionOffsetBegin))
print("section offsets begin", sectionOffsetBegin)

unihanStringBegin = sectionOffsetBegin + sectionsBlocks.calculateSectionOffsetSize()
out.write(pack("=I", unihanStringBegin))
print("unihan strings begin", unihanStringBegin)

unihanOffsetBegin = unihanStringBegin + unihan.calculateStringSize()
out.write(pack("=I", unihanOffsetBegin))
print("unihan offsets begin", unihanOffsetBegin)

end = unihanOffsetBegin + unihan.calculateOffsetSize()
print("end should be", end)

pos += 40

print("========== writing data ====================")

pos = names.writeStrings(out, pos)
print("names strings written, position", pos)
pos = names.writeOffsets(out, pos)
print("names offsets written, position", pos)
pos = details.writeStrings(out, pos)
print("details strings written, position", pos)
pos = details.writeOffsets(out, pos)
print("details offsets written, position", pos)
pos = sectionsBlocks.writeBlockStrings(out, pos)
print(sectionsBlocks.sections)
print("block strings written, position", pos)
pos = sectionsBlocks.writeBlockOffsets(out, pos)
print("block offsets written, position", pos)
pos = sectionsBlocks.writeSectionStrings(out, pos)
print("section strings written, position", pos)
pos = sectionsBlocks.writeSectionOffsets(out, pos)
print("section offsets written, position", pos)
pos = unihan.writeStrings(out, pos)
print("unihan strings written, position", pos)
pos = unihan.writeOffsets(out, pos)
print("unihan offsets written, position", pos)

print("========== writing translation dummy ======")
translationData = [["KCharSelect section name", sectionsBlocks.getSectionList()], ["KCharselect unicode block name", sectionsBlocks.getBlockList()]]
writeTranslationDummy(outTranslationDummy, translationData)
print("done. make sure to copy both kcharselect-data and kcharselect-translation.cpp.")