Last updated on October 6th, 2024 at 08:35 pm
Here, We see UTF-8 Validation LeetCode Solution. This Leetcode problem is done in many programming languages like C++, Java, JavaScript, Python, etc. with different approaches.
List of all LeetCode Solution
Topics
Bit Manipulation
Companies
Level of Question
Medium
UTF-8 Validation LeetCode Solution
Table of Contents
Problem Statement
Given an integer array data representing the data, return whether it is a valid UTF-8 encoding (i.e. it translates to a sequence of valid UTF-8 encoded characters).
A character in UTF8 can be from 1 to 4 bytes long, subjected to the following rules:
- For a 1-byte character, the first bit is a 0, followed by its Unicode code.
- For an n-bytes character, the first
n
bits are all one’s, the n + 1 bit is 0, followed by n – 1 bytes with the most significant 2 bits being 10.
This is how the UTF-8 encoding would work: Number of Bytes | UTF-8 Octet Sequence | (binary) ——————–+—————————————– 1 | 0xxxxxxx 2 | 110xxxxx 10xxxxxx 3 | 1110xxxx 10xxxxxx 10xxxxxx 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
x
denotes a bit in the binary form of a byte that may be either 0 or 1.
Note: The input is an array of integers. Only the least significant 8 bits of each integer is used to store the data. This means each integer represents only 1 byte of data.
Example 1:
Input: data = [197,130,1]
Output: true
Explanation: data represents the octet sequence: 11000101 10000010 00000001. It is a valid utf-8 encoding for a 2-bytes character followed by a 1-byte character.
Example 2:
Input: data = [235,140,4]
Output: false
Explanation:
data represented the octet sequence: 11101011 10001100 00000100.
The first 3 bits are all one’s and the 4th bit is 0 means it is a 3-bytes character.
The next byte is a continuation byte which starts with 10 and that’s correct.
But the second continuation byte does not start with 10, so it is invalid.
1. UTF-8 Validation LeetCode Solution C++
class Solution { public: bool validUtf8(vector<int>& data) { int remaining = 0; for (auto& x : data) { if (remaining == 0) { if ((x >> 5) == 0b110) { remaining = 1; } else if ((x >> 4) == 0b1110) { remaining = 2; } else if ((x >> 3) == 0b11110) { remaining = 3; } else if ((x >> 7) != 0) { return false; } } else { if ((x >> 6) != 0b10) return false; else remaining--; } } return remaining == 0; } };
2. UTF-8 Validation LeetCode Solution Java
class Solution { public boolean validUtf8(int[] data) { if(data==null || data.length==0) return false; boolean isValid = true; for(int i=0;i<data.length;i++) { if(data[i]>255) return false; int numberOfBytes = 0; if((data[i] & 128) == 0) { numberOfBytes = 1; } else if((data[i] & 224) == 192) { numberOfBytes = 2; } else if((data[i] & 240) == 224) { numberOfBytes = 3; } else if((data[i] & 248) == 240) { numberOfBytes = 4; } else { return false; } for(int j=1;j<numberOfBytes;j++) { if(i+j>=data.length) return false; if((data[i+j] & 192) != 128) return false; } i=i+numberOfBytes-1; } return isValid; } }
3. UTF-8 Validation LeetCode Solution JavaScript
var validUtf8 = function(data) { let binary = data.map((i)=>{ let b = "00000000" + i.toString(2); return b.substring(b.length - 8); }) let current = 0; for (let i=0;i<binary.length;i++) { let bytes = binary[i].indexOf('0'); if (current==0) { if (bytes==0) continue; if (bytes > 4 || bytes < 2) return false; current = bytes; } else { if (bytes != 1) return false } current--; } return current==0; };
4. UTF-8 Validation Solution Python
class Solution(object): def validUtf8(self, data): n = len(data) i = 0 while i < n: valid_encoding = False if self.one_byte_encoding(data[i]): i += 1 valid_encoding = True for byte_len in range(2, 4 + 1): if self.byte_encoding(byte_len, data, i): i += byte_len valid_encoding = True break if not valid_encoding: return False return True def one_byte_encoding(self, number): if number & 1 << 7 == 0: return True return False def byte_encoding(self, byte_len, data, i): if i + byte_len > len(data): return False first_byte = data[i] for j in range(byte_len): if first_byte & 1<<(7-j) == 0: return False if first_byte & 1 << 7 - byte_len != 0: return False for j in range(i+1, i + 1 + (byte_len - 1)): if data[j] & 1<<7 == 0: return False if data[j] & 1<< 6 != 0: return False return True